Skip to content

Commit

Permalink
[ci] Remove hardcoded test shards
Browse files Browse the repository at this point in the history
This moves the sharding logic from being inlined in the Jenkinsfile to being templated, so we can change just the number of shards and the test allocation in `conftest.py`, and the Jenkinsfile will be regenerated to match. This also changes the test allocation from manual balancing to pseudo-random assignment between shards. Each shard needs to know only its shard index and the total number of shards; it then hashes each test and skips it unless that hash falls within its allocated tests. This spreads related tests across shards, but has the downside that any change to the number of shards will shuffle where the tests end up (ideally this is rare, once we settle on a good number of shards to use).

For now this applies only to the GPU frontend tests, but eventually we could expand it to more jobs.
  • Loading branch information
driazati committed Apr 6, 2022
1 parent e09e939 commit a69f079
Show file tree
Hide file tree
Showing 5 changed files with 343 additions and 215 deletions.
207 changes: 162 additions & 45 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// 'python3 jenkins/generate.py'
// Note: This timestamp is here to ensure that updates to the Jenkinsfile are
// always rebased on main before merging:
// Generated at 2022-03-30T11:40:52.107833
// Generated at 2022-04-06T11:18:33.573732

import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
Expand Down Expand Up @@ -88,7 +88,8 @@ tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
upstream_revision = null

// command to start a docker container
docker_run = 'docker/bash.sh'
docker_run = 'docker/bash.sh --env CI'
docker_run_shard = "${docker_run} --env SHARD_INDEX --env NUM_SHARDS"
docker_build = 'docker/build.sh'
// timeout in minutes
max_time = 240
Expand Down Expand Up @@ -454,7 +455,7 @@ def fsim_test(image) {

def cmake_build(image, path, make_flag) {
sh (
script: "${docker_run} ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
label: 'Run cmake build',
)
}
Expand Down Expand Up @@ -673,27 +674,58 @@ stage('Test') {
Utils.markStageSkippedForConditional('unittest: GPU')
}
},
'integration: CPU': {
'integration: CPU 1 of 2': {
if (!skip_ci && is_docs_only_build != 1) {
node('CPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
try {
init_git()
unpack_lib('cpu', tvm_multilib_tsim)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_cpu)
sh (
script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
label: 'Run CPU integration tests',
)
withEnv([
'TVM_NUM_SHARDS=2',
'TVM_SHARD_INDEX=0'], {
unpack_lib('cpu', tvm_multilib_tsim)
ci_setup(ci_cpu)
sh (
script: "${docker_run_shard} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
label: 'Run CPU integration tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('integration: CPU')
Utils.markStageSkippedForConditional('integration: CPU 1 of 2')
}
},
'integration: CPU 2 of 2': {
if (!skip_ci && is_docs_only_build != 1) {
node('CPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
try {
init_git()
timeout(time: max_time, unit: 'MINUTES') {
withEnv([
'TVM_NUM_SHARDS=2',
'TVM_SHARD_INDEX=1'], {
unpack_lib('cpu', tvm_multilib_tsim)
ci_setup(ci_cpu)
sh (
script: "${docker_run_shard} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
label: 'Run CPU integration tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('integration: CPU 2 of 2')
}
},
'unittest: CPU': {
Expand Down Expand Up @@ -748,17 +780,16 @@ stage('Test') {
Utils.markStageSkippedForConditional('python3: i386')
}
},
'python3: aarch64': {
'topi: aarch64': {
if (!skip_ci && is_docs_only_build != 1) {
node('ARM') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
try {
init_git()
unpack_lib('arm', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
timeout(time: max_time, unit: 'MINUTES') {
try {
init_git()
unpack_lib('arm', tvm_multilib)
ci_setup(ci_arm)
cpp_unittest(ci_arm)
python_unittest(ci_arm)
sh (
script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
label: 'Run test_arm_compute_lib test',
Expand All @@ -767,87 +798,173 @@ stage('Test') {
script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
label: 'Run TOPI tests',
)
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
}
} else {
Utils.markStageSkippedForConditional('topi: aarch64')
}
},
'integration: aarch64': {
if (!skip_ci && is_docs_only_build != 1) {
node('ARM') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
timeout(time: max_time, unit: 'MINUTES') {
try {
init_git()
unpack_lib('arm', tvm_multilib)
ci_setup(ci_arm)
python_unittest(ci_arm)
sh (
script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
label: 'Run CPU integration tests',
)
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
}
} else {
Utils.markStageSkippedForConditional('integration: aarch64')
}
},
'topi: GPU 1 of 2': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
try {
init_git()
timeout(time: max_time, unit: 'MINUTES') {
withEnv([
'TVM_NUM_SHARDS=2',
'TVM_SHARD_INDEX=0'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
label: 'Run TOPI tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('python3: arm')
Utils.markStageSkippedForConditional('topi: GPU 1 of 2')
}
},
'topi: GPU': {
'topi: GPU 2 of 2': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
label: 'Run TOPI tests',
)
withEnv([
'TVM_NUM_SHARDS=2',
'TVM_SHARD_INDEX=1'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
label: 'Run TOPI tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('topi: GPU')
Utils.markStageSkippedForConditional('topi: GPU 2 of 2')
}
},
'frontend: GPU 1': {
'frontend: GPU 1 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 1",
label: 'Run Python frontend tests (shard 1)',
)
withEnv([
'TVM_NUM_SHARDS=3',
'TVM_SHARD_INDEX=0'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 1')
} else {
Utils.markStageSkippedForConditional('frontend: GPU 1 of 3')
}
},
'frontend: GPU 2': {
'frontend: GPU 2 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
unpack_lib('gpu', tvm_multilib)
timeout(time: max_time, unit: 'MINUTES') {
ci_setup(ci_gpu)
sh (
script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 2",
label: 'Run Python frontend tests (shard 2)',
)
withEnv([
'TVM_NUM_SHARDS=3',
'TVM_SHARD_INDEX=1'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 2')
} else {
Utils.markStageSkippedForConditional('frontend: GPU 2 of 3')
}
},
'frontend: GPU 3 of 3': {
if (!skip_ci && is_docs_only_build != 1) {
node('GPU') {
ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
try {
init_git()
timeout(time: max_time, unit: 'MINUTES') {
withEnv([
'TVM_NUM_SHARDS=3',
'TVM_SHARD_INDEX=2'], {
unpack_lib('gpu', tvm_multilib)
ci_setup(ci_gpu)
sh (
script: "${docker_run_shard} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
label: 'Run Python frontend tests',
)
})
}
} finally {
junit 'build/pytest-results/*.xml'
}
}
}
} else {
Utils.markStageSkippedForConditional('frontend: GPU 3 of 3')
}
},
'frontend: CPU': {
Expand Down
62 changes: 62 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,67 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import hashlib
import pytest
import os
from collections import OrderedDict

pytest_plugins = ["tvm.testing.plugin"]


# These are long running tests (manually curated and extracted from CI logs)
# that should be allocated to test shards in a round-robin fashion. These are
# taken from the 20 (arbitrary number) of tests as from
# https://ci.tlcpack.ai/job/tvm/job/main/2907/testReport
#
# NOTE(review): several entries appear twice (e.g. test_upsampling3d,
# test_conv2d_nchw[int8]); the later index silently wins, so those tests land
# on the shard of their *last* occurrence and the earlier round-robin slot is
# left unused — confirm the duplicates are intentional.
_slowest_tests = [
    "tests/python/frontend/tensorflow/test_forward.py::test_forward_broadcast_args",
    "tests/python/frontend/tensorflow/test_forward.py::test_forward_broadcast_to",
    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
    "tests/python/frontend/tflite/test_forward.py::test_all_elemwise",
    "tests/python/frontend/pytorch/test_object_detection.py::test_detection_models",
    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
    "tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py::test_conv2d_hwnc_tensorcore",
    "tests/python/contrib/test_tensorrt.py::test_binary[compile]",
    "tests/python/frontend/pytorch/test_forward.py::test_segmentation_models",
    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
    "tests/python/relay/test_py_converter.py::test_global_recursion",
    "tests/python/frontend/tensorflow/test_forward.py::test_forward_ptb",
    "tests/python/relay/test_op_level6.py::test_topk",
    "tests/python/topi/python/test_topi_conv2d_winograd.py::test_conv2d_nchw",
    "tests/python/relay/test_py_converter.py::test_global_recursion",
]

# Map each curated slow test to a fixed round-robin slot (its index in the
# list above). On duplicate node ids the last index wins, matching the
# original loop's semantics.
HARDCODED_ALLOCATIONS = {test: idx for idx, test in enumerate(_slowest_tests)}


def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
    """Return True if the test identified by ``nodeid`` belongs on this shard.

    Curated slow tests use their fixed round-robin slot from
    ``HARDCODED_ALLOCATIONS``; every other test is assigned pseudo-randomly
    (but deterministically across processes) by hashing its node id, so each
    test runs on exactly one of the ``num_shards`` shards.
    """
    if nodeid in HARDCODED_ALLOCATIONS:
        bucket = HARDCODED_ALLOCATIONS[nodeid]
    else:
        # md5 is used only as a stable cross-process hash (the builtin hash()
        # is salted per interpreter run), not for security. Avoid naming the
        # local "hash" so the builtin is not shadowed.
        digest = hashlib.md5(nodeid.encode())
        bucket = int(digest.hexdigest(), 16)

    return bucket % num_shards == shard_index


def pytest_collection_modifyitems(config, items):
    """Mark every collected test that is not allocated to this shard as skipped.

    Sharding is active only when ``CI``, ``TVM_NUM_SHARDS``, and
    ``TVM_SHARD_INDEX`` are all present in the environment; otherwise the
    full suite runs unchanged (e.g. for local development).
    """
    if not all(k in os.environ for k in ["CI", "TVM_NUM_SHARDS", "TVM_SHARD_INDEX"]):
        # Only apportion tests if in CI and in a job that is set up for it
        return

    num_shards = int(os.environ["TVM_NUM_SHARDS"])
    shard_index = int(os.environ["TVM_SHARD_INDEX"])

    print(f"Marking tests for shard {shard_index} of {num_shards}")
    for item in items:
        if not should_run(item.nodeid, num_shards=num_shards, shard_index=shard_index):
            # Attach a reason so shard test reports explain the skips instead
            # of showing bare unexplained skip markers.
            item.add_marker(
                pytest.mark.skip(
                    reason=f"Test allocated to shard other than {shard_index} of {num_shards}"
                )
            )
Loading

0 comments on commit a69f079

Please sign in to comment.