Skip to content

Commit

Permalink
[ci] Remove hardcoded test shards
Browse files Browse the repository at this point in the history
This moves the sharding logic from being inlined in the Jenkinsfile to being templated, so we can change just the number of shards and the test allocation in `conftest.py`, and the generated Jenkinsfile will match. This also changes the test allocation from the previous manual balancing to a pseudo-random assignment between shards. Each shard needs to know only its shard number and the total number of shards; it then hashes each test and skips it unless that hash falls within its allocated tests. This breaks up related tests across shards, but has the downside that any change to the number of shards will shuffle around where the tests end up (ideally this is rare, as we settle on a good number of shards to use).

For now this applies only to the GPU frontend tests, but eventually we could expand it to more jobs.
  • Loading branch information
driazati committed Mar 29, 2022
1 parent a3a155c commit a26b0ce
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 113 deletions.
79 changes: 59 additions & 20 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
tvm_multilib

// command to start a docker container
docker_run = 'docker/bash.sh'
docker_run = 'docker/bash.sh --env CI'
docker_run_shard = "${docker_run} --env SHARD_INDEX --env NUM_SHARDS"
docker_build = 'docker/build.sh'
// timeout in minutes
max_time = 240
Expand Down Expand Up @@ -428,7 +429,7 @@ def fsim_test(image) {

def cmake_build(image, path, make_flag) {
sh (
script: "${docker_run} ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
label: 'Run cmake build',
)
}
Expand Down Expand Up @@ -778,50 +779,88 @@ stage('Test') {
Utils.markStageSkippedForConditional('topi: GPU')
}
},
'frontend: GPU 1': {
'frontend: GPU 1 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 1",
label: 'Run Python frontend tests (shard 1)',
)
withEnv([
'NUM_SHARDS=3',
'SHARD_INDEX=1'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)

})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 1')
} else {
Utils.markStageSkippedForConditional('frontend: GPU 1 of 3')
}
},
'frontend: GPU 2': {
'frontend: GPU 2 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 2",
label: 'Run Python frontend tests (shard 2)',
)
withEnv([
'NUM_SHARDS=3',
'SHARD_INDEX=2'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)

})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 2')
} else {
Utils.markStageSkippedForConditional('frontend: GPU 2 of 3')
}
},
'frontend: GPU 3 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
timeout(time: max_time, unit: 'MINUTES') {
withEnv([
'NUM_SHARDS=3',
'SHARD_INDEX=3'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)

})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 3 of 3')
}
},
'frontend: CPU': {
Expand Down
29 changes: 29 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,34 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import hashlib
import pytest
import os

pytest_plugins = ["tvm.testing.plugin"]


def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
    """
    Return True if the test with this nodeid is allocated to this shard.

    Tests are apportioned by hashing the nodeid and bucketing the digest
    modulo num_shards. md5 is used (rather than the built-in hash()) so the
    allocation is stable across processes and runs.

    shard_index is also taken modulo num_shards so that both the 1-based
    indices exported by the Jenkinsfile (SHARD_INDEX=1..NUM_SHARDS) and
    0-based indices work. Without this normalization, shard num_shards
    would run no tests and tests hashing to bucket 0 would never run on
    any shard.
    """
    digest = int(hashlib.md5(nodeid.encode()).hexdigest(), 16)
    return digest % num_shards == shard_index % num_shards


def pytest_collection_modifyitems(config, items):
    """Mark tests not allocated to this CI shard as skipped.

    Only active when running in CI (``CI`` is set) and the job has been set
    up for sharding (both ``NUM_SHARDS`` and ``SHARD_INDEX`` are set);
    otherwise the collected items are left untouched.
    """
    env = os.environ
    if "CI" not in env:
        return

    # Only apportion tests if in CI and in a job that is set up for it
    if "NUM_SHARDS" not in env or "SHARD_INDEX" not in env:
        return

    num_shards, shard_index = int(env["NUM_SHARDS"]), int(env["SHARD_INDEX"])

    print(f"Marking tests for shard {shard_index} of {num_shards}")
    skip = pytest.mark.skip()
    for item in items:
        allocated = should_run(
            item.nodeid, num_shards=num_shards, shard_index=shard_index
        )
        if not allocated:
            item.add_marker(skip)
59 changes: 11 additions & 48 deletions jenkins/Jenkinsfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
tvm_multilib

// command to start a docker container
docker_run = 'docker/bash.sh'
docker_run = 'docker/bash.sh --env CI'
docker_run_shard = "${docker_run} --env SHARD_INDEX --env NUM_SHARDS"
docker_build = 'docker/build.sh'
// timeout in minutes
max_time = 240
Expand Down Expand Up @@ -425,7 +426,7 @@ def fsim_test(image) {

def cmake_build(image, path, make_flag) {
sh (
script: "${docker_run} ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
label: 'Run cmake build',
)
}
Expand Down Expand Up @@ -775,52 +776,14 @@ stage('Test') {
Utils.markStageSkippedForConditional('topi: GPU')
}
},
'frontend: GPU 1': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws({{ m.per_exec_ws('tvm/frontend-python-gpu') }}) {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 1",
label: 'Run Python frontend tests (shard 1)',
)
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 1')
}
},
'frontend: GPU 2': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws({{ m.per_exec_ws('tvm/frontend-python-gpu') }}) {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 2",
label: 'Run Python frontend tests (shard 2)',
)
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 2')
}
},
{% call m.sharded_test_step(name="frontend: GPU", node="GPU", num_shards=3, ws="tvm/frontend-python-gpu") %}
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)
{% endcall %}
'frontend: CPU': {
if (!skip_ci && is_docs_only_build != 1) {
node('CPU') {
Expand Down
27 changes: 27 additions & 0 deletions jenkins/macros.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,30 @@
{% macro per_exec_ws(folder) -%}
"workspace/exec_${env.EXECUTOR_NUMBER}/{{ folder }}"
{%- endmacro -%}

{#- Emit one Jenkins parallel-stage entry per shard for a sharded test step.

    name:       base stage name; rendered as "<name> <i> of <num_shards>"
    num_shards: total number of shards; shard indices are 1-based (1..num_shards)
    node:       Jenkins node label to run on (e.g. "GPU")
    ws:         per-executor workspace folder, passed to per_exec_ws

    The caller body (the actual test commands) is spliced into each stage
    inside a withEnv that exports NUM_SHARDS and SHARD_INDEX, which
    conftest.py reads to decide which tests to run on this shard.
    Whitespace-control markers ({#- -#}) keep this comment out of the
    rendered Jenkinsfile. -#}
{% macro sharded_test_step(name, num_shards, node, ws) %}
{% for shard_index in range(1, num_shards + 1) %}
'{{ name }} {{ shard_index }} of {{ num_shards }}': {
  if (!skip_ci && is_docs_only_build != 1) {
    node('{{ node }}') {
      ws({{ per_exec_ws(ws) }}) {
        try {
          init_git()
          timeout(time: max_time, unit: 'MINUTES') {
            withEnv([
              'NUM_SHARDS={{ num_shards }}',
              'SHARD_INDEX={{ shard_index }}'], {
              {{ caller() | indent(width=16) }}
            })
          }
        } finally {
          junit 'build/pytest-results/*.xml'
        }
      }
    }
  } else {
    Utils.markStageSkippedForConditional('{{ name }} {{ shard_index }} of {{ num_shards }}')
  }
},
{% endfor %}
{% endmacro %}
68 changes: 23 additions & 45 deletions tests/scripts/task_python_frontend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,56 +30,34 @@ find . -type f -path "*.pyc" | xargs rm -f
# Rebuild cython
make cython3

# These tests are sharded into two sections in order to increase parallelism in CI.
# The split is purely based on balancing the runtime of each shard so they should
# be about the same. This may need rebalancing in the future if this is no longer
# the case.
function shard1 {
echo "Running relay MXNet frontend test..."
run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet

echo "Running relay ONNX frontend test..."
run_pytest cython python-frontend-onnx tests/python/frontend/onnx
echo "Running relay MXNet frontend test..."
run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet

echo "Running relay PyTorch frontend test..."
run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
}
echo "Running relay ONNX frontend test..."
run_pytest cython python-frontend-onnx tests/python/frontend/onnx

function shard2 {
echo "Running relay Tensorflow frontend test..."
# Note: Tensorflow tests often have memory issues, so invoke each one separately
TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
i=0
for node_id in $TENSORFLOW_TESTS; do
echo "$node_id"
run_pytest cython "python-frontend-tensorflow-$i" "$node_id"
i=$((i+1))
done
echo "Running relay PyTorch frontend test..."
run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch

echo "Running relay caffe2 frontend test..."
run_pytest cython python-frontend-caffe2 tests/python/frontend/caffe2
echo "Running relay Tensorflow frontend test..."
# Note: Tensorflow tests often have memory issues, so invoke each one separately
TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
i=0
for node_id in $TENSORFLOW_TESTS; do
echo "$node_id"
run_pytest cython "python-frontend-tensorflow-$i" "$node_id"
i=$((i+1))
done

echo "Running relay DarkNet frontend test..."
run_pytest cython python-frontend-darknet tests/python/frontend/darknet
echo "Running relay caffe2 frontend test..."
run_pytest cython python-frontend-caffe2 tests/python/frontend/caffe2

echo "Running relay PaddlePaddle frontend test..."
run_pytest cython python-frontend-paddlepaddle tests/python/frontend/paddlepaddle
echo "Running relay DarkNet frontend test..."
run_pytest cython python-frontend-darknet tests/python/frontend/darknet

echo "Running relay CoreML frontend test..."
run_pytest cython python-frontend-coreml tests/python/frontend/coreml
}
echo "Running relay PaddlePaddle frontend test..."
run_pytest cython python-frontend-paddlepaddle tests/python/frontend/paddlepaddle


if [ -z ${1+x} ]; then
# TODO: This case can be removed once https://github.com/apache/tvm/pull/10413
# is merged.
# No sharding set, run everything
shard1
shard2
else
if [ "$1" == "1" ]; then
shard1
else
shard2
fi
fi
echo "Running relay CoreML frontend test..."
run_pytest cython python-frontend-coreml tests/python/frontend/coreml

0 comments on commit a26b0ce

Please sign in to comment.