E2E Nightly_OnDemand Tests #247

name: E2E Nightly_OnDemand Tests
on:
schedule:
# 21:00 every day, GMT+8 (13:00 UTC)
- cron: '0 13 * * *'
workflow_dispatch:
inputs:
python:
required: false
type: string
default: '3.10'
description: Specify python version
triton:
required: false
type: string
default: ''
description: Specify triton commit; defaults to the PyTorch pinned commit
suite:
required: true
type: string
default: 'huggingface'
description: Dynamo benchmark test suites to run (huggingface, timm_models, torchbench); comma-delimited
dt:
required: true
type: string
default: 'float32'
description: Data precision of the test (float32, bfloat16, float16, amp_bf16, amp_fp16); comma-delimited
mode:
required: true
type: string
default: 'inference'
description: Test mode (inference, training); comma-delimited
scenario:
required: true
type: string
default: 'accuracy'
description: Test scenario (accuracy, performance); comma-delimited
model:
required: false
type: string
default: ''
description: If set, only the specified model is tested
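# Example on-demand dispatch via the GitHub CLI (illustrative sketch; the workflow
# can also be referenced by its file name under .github/workflows):
#   gh workflow run "E2E Nightly_OnDemand Tests" \
#     -f suite=huggingface -f dt=float32 -f mode=inference -f scenario=accuracy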
permissions: read-all
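# One concurrency group per trigger/ref + input combination; a newer run cancels
# any in-progress run in the same group (cancel-in-progress: true).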
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ inputs.suite }}-${{ inputs.dt }}-${{ inputs.mode }}-${{ inputs.scenario }}-${{ inputs.triton }}-${{ inputs.model }}
cancel-in-progress: true
jobs:
Inductor-XPU-E2E-Nightly-Tests:
runs-on: pvc_e2e
timeout-minutes: 900
outputs:
TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }}
TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }}
DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }}
BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }}
OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }}
GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }}
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
- name: Prepare Conda ENV
run: |
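# Recreate the e2e_ci conda environment from scratch with the requested Python version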
which conda && conda clean -ay
conda remove --all -y -n e2e_ci || rm -rf $(dirname ${CONDA_EXE})/../envs/e2e_ci
conda create -n e2e_ci python=${{ inputs.python }} cmake ninja -y
source activate e2e_ci
conda install -c intel mkl-static mkl-include -y
pip install pandas scipy tqdm
- name: Prepare Stock Pytorch
run: |
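# Clone stock PyTorch (main branch) and replace third_party/torch-xpu-ops with the local checkout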
pwd
cd ../ && rm -rf pytorch
source activate e2e_ci
git clone -b main https://github.com/pytorch/pytorch pytorch
cd pytorch
# apply PRs for stock pytorch
pip install requests
python ../torch-xpu-ops/.github/scripts/apply_torch_pr.py
git status && git show -s
git submodule sync && git submodule update --init --recursive
rm -rf third_party/torch-xpu-ops && cp -r ../torch-xpu-ops third_party/
# Workaround for torch-xpu-ops ci test
sed -i "s/checkout --quiet \${TORCH_XPU_OPS_COMMIT}/log -n 1/g" caffe2/CMakeLists.txt
- name: Identify pinned versions
id: pinned
run: |
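# Export pinned component commits and toolchain/driver versions to GITHUB_ENV (and GITHUB_OUTPUT for job outputs)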
cd ../pytorch
if [ -z "${{ inputs.triton }}" ]; then
echo "TRITON_COMMIT_ID=$(<.ci/docker/ci_commit_pins/triton-xpu.txt)" >> "${GITHUB_ENV}"
else
echo "TRITON_COMMIT_ID=${{ inputs.triton }}" >> "${GITHUB_ENV}"
fi
echo "TORCH_BRANCH_ID=$(git rev-parse --abbrev-ref HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCH_COMMIT_ID=$(git rev-parse HEAD)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "TORCHBENCH_COMMIT_ID=$(<third_party/torch-xpu-ops/.github/ci_commit_pins/torchbench.txt)" >> "${GITHUB_ENV}"
echo "TORCHVISION_COMMIT_ID=$(<.github/ci_commit_pins/vision.txt)" >> "${GITHUB_ENV}"
echo "TORCHTEXT_COMMIT_ID=$(<.github/ci_commit_pins/text.txt)" >> "${GITHUB_ENV}"
echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}"
echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}"
echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}"
echo "MODEL_ONLY_NAME=${{ inputs.model }}" >> "${GITHUB_ENV}"
source /opt/intel/oneapi/compiler/latest/env/vars.sh
echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
. /etc/os-release
echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo "GCC_VERSION=$(gcc -dumpversion)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}"
echo ${GITHUB_ENV}
- name: Triton Installation
run: |
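# Install Triton for XPU from the intel-xpu-backend-for-triton repo at TRITON_COMMIT_ID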
source activate e2e_ci
cd ../pytorch
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
echo ${TRITON_REPO}@${TRITON_COMMIT_ID}
pip install --force-reinstall "git+${TRITON_REPO}@${TRITON_COMMIT_ID}#subdirectory=python"
- name: Build Pytorch XPU
run: |
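# Build the PyTorch XPU wheel with the oneAPI compiler and install it into the e2e_ci environment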
source activate e2e_ci
cd ../pytorch
pip install -r requirements.txt
export USE_XPU=1
source /opt/intel/oneapi/compiler/latest/env/vars.sh
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py bdist_wheel
pip install --force-reinstall dist/*.whl
- name: Show GITHUB_ENV
run: |
echo "$GITHUB_ENV"
rm -rf ../pytorch/inductor_log
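# Nightly test matrix: the steps below run only when inputs.suite is empty, i.e. on scheduled runs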
- name: Nightly Huggingface FP32 Inference Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
env_prepare: true
dt: float32
mode: inference
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Huggingface BF16 Inference Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: bfloat16
mode: inference
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Huggingface FP16 Inference Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: float16
mode: inference
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Huggingface FP32 Training Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
env_prepare: true
dt: float32
mode: training
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Huggingface BF16 Training Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: bfloat16
mode: training
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Huggingface FP16 Training Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: huggingface
dt: float16
mode: training
scenario: accuracy
expected_pass_num: 46
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Torchbench BF16 Training Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: torchbench
dt: bfloat16
mode: training
scenario: accuracy
expected_pass_num: 39
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Nightly Timm_models FP16 Training Accuracy Test
if: ${{ !inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: timm_models
dt: float16
mode: training
scenario: accuracy
expected_pass_num: 22
env_prepare: true
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
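# On-demand test: workflow_dispatch runs execute only the requested suite/dt/mode/scenario combination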
- name: OnDemand Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
if: ${{ inputs.suite }}
uses: ./.github/actions/inductor-xpu-e2e-test
with:
suite: ${{ inputs.suite }}
env_prepare: true
dt: ${{ inputs.dt }}
mode: ${{ inputs.mode }}
scenario: ${{ inputs.scenario }}
hf_token: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
- name: Summarize archive files
run: |
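# Collect inductor logs for the artifact upload step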
rm -rf ${{ github.workspace }}/upload_files
cp -r ${{ github.workspace }}/../pytorch/inductor_log ${{ github.workspace }}/upload_files
- name: Upload Inductor XPU E2E Data
if: always()
uses: actions/upload-artifact@v4
with:
name: Inductor-XPU-E2E-Data-${{ github.event.pull_request.number || github.sha }}
path: ${{ github.workspace }}/upload_files
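# Reporting job: posts the run status as a comment on tracking issue 426 (on-demand) or 432 (nightly)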
Tests-Failure-And-Report:
if: always()
runs-on: pvc_e2e
permissions:
issues: write
env:
GH_TOKEN: ${{ github.token }}
needs: Inductor-XPU-E2E-Nightly-Tests
steps:
- name: Report github issue for XPU OPS nightly
run: |
set -xe
# Test status
if [ "${{ needs.Inductor-XPU-E2E-Nightly-Tests.result }}" == "success" ];then
test_status=Success
elif [ "${{ needs.Inductor-XPU-E2E-Nightly-Tests.result }}" == "failure" ];then
test_status=Failure
cc_comment="CC ${{ secrets.NIGHTLY_EMAIL_LIST }}"
else
test_status=None
exit 0
fi
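# Collect run metadata for the report body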
build_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
repo="${{ github.repository }}"
test_xpu="${GITHUB_WORKFLOW_SHA} on ${GITHUB_REF_NAME}"
test_torch="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCH_COMMIT_ID }} on ${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.TORCH_BRANCH_ID }}"
test_driver="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.DRIVER_VERSION }}"
test_bundle="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.BUNDLE_VERSION }}"
test_os="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.OS_PRETTY_NAME }}"
test_gcc="${{ needs.Inductor-XPU-E2E-Nightly-Tests.outputs.GCC_VERSION }}"
# On-demand
if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then
cc_comment="CC @${GITHUB_TRIGGERING_ACTOR}"
test_scope="Inputs|suite=${{ inputs.suite }}; dt=${{ inputs.dt }}; mode=${{ inputs.mode }}; scenario=${{ inputs.scenario }};"
if [ "${{ inputs.triton }}" != "" ];then
test_scope+=" triton=${{ inputs.triton }};"
fi
if [ "${{ inputs.model }}" != "" ];then
test_scope+=" model=${{ inputs.model }};"
fi
gh --repo $repo issue comment 426 --body "$cc_comment
$(date +'%F') On-demand Test Status: **${test_status}**
See: $build_url
Torch-xpu-ops | PyTorch
--- | ---
$test_xpu | $test_torch
Device | OS | GCC | Python | Driver | Bundle
--- | --- | --- | --- | --- | ---
$RUNNER_NAME | $test_os | $test_gcc | ${{ inputs.python }} | $test_driver | $test_bundle
Inputs | $test_scope
--- | ---
"
# Nightly
else
gh --repo $repo issue comment 432 --body "$cc_comment
$(date +'%F') Nightly Test Status: **${test_status}**
See: $build_url
Torch-xpu-ops | PyTorch
--- | ---
$test_xpu | $test_torch
Device | OS | GCC | Python | Driver | Bundle
--- | --- | --- | --- | --- | ---
$RUNNER_NAME | $test_os | $test_gcc | ${{ inputs.python }} | $test_driver | $test_bundle
"
fi