From 0c17f07aa7dcfb54abffade0212400f56f913f55 Mon Sep 17 00:00:00 2001
From: driazati <9407960+driazati@users.noreply.github.com>
Date: Fri, 8 Apr 2022 11:06:12 -0700
Subject: [PATCH] [ci] Remove hardcoded test shards (#10743)

This moves the sharding logic from being inlined in the Jenkinsfile to being templated, so we can change just the number of shards and the test allocation in `conftest.py` and the Jenkinsfile will work to match. This also changes the test allocation from the previous manual balancing to a pseudo-random assignment across shards. Each shard needs to know only its shard number and the total number of shards; it then hashes each test and skips it unless that hash falls within the shard's allocated tests. This breaks up related tests across shards, but has the downside that any change to the number of shards will shuffle where the tests end up (ideally this will be rare once we settle on a good number of shards to use).

This is currently applied to the CPU and aarch64 integration tests, the GPU TOPI tests, and the GPU frontend tests, but eventually we could expand it to more.

Co-authored-by: driazati <driazati@users.noreply.github.com>
---
 Jenkinsfile                           | 246 +++++++++++++++++++++-----
 conftest.py                           |  75 ++++++++
 jenkins/Jenkinsfile.j2                | 175 +++++-------------
 jenkins/macros.j2                     |  49 +++++
 tests/scripts/task_python_frontend.sh |  64 +++----
 5 files changed, 390 insertions(+), 219 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index f2c5e18d24a9..b0e263c51360 100755
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -45,7 +45,7 @@
 // 'python3 jenkins/generate.py'
 // Note: This timestamp is here to ensure that updates to the Jenkinsfile are
 // always rebased on main before merging:
-// Generated at 2022-03-30T11:40:52.107833
+// Generated at 2022-04-07T13:50:22.427152
 
 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
 // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->
@@ -88,7 +88,7 @@ tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
 upstream_revision = null
 
 // command to start a docker container
-docker_run = 'docker/bash.sh'
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS'
 docker_build = 'docker/build.sh'
 // timeout in minutes
 max_time = 240
@@ -454,7 +454,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
     label: 'Run cmake build',
   )
 }
@@ -673,19 +673,50 @@ stage('Test') {
       Utils.markStageSkippedForConditional('unittest: GPU')
     }
   },
-  'integration: CPU': {
+  'integration: CPU 1 of 2': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU') {
-        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
           try {
             init_git()
-            unpack_lib('cpu', tvm_multilib_tsim)
             timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=0'], {
+                unpack_lib('cpu', tvm_multilib_tsim)
+                ci_setup(ci_cpu)
+                sh (
+                  script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                  label: 'Run CPU integration tests',
+                )
+              })
+            }
+          } finally {
+            junit 'build/pytest-results/*.xml'
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('integration: CPU 1 of 2')
+    }
+  },
+  'integration: CPU 2 of 2': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('CPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=1'], {
+                unpack_lib('cpu', tvm_multilib_tsim)
+                ci_setup(ci_cpu)
+                sh (
+                  script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+                  label: 'Run CPU integration tests',
+                )
+              })
             }
           } finally {
             junit 'build/pytest-results/*.xml'
@@ -693,7 +724,7 @@ stage('Test') {
         }
       }
     } else {
-      Utils.markStageSkippedForConditional('integration: CPU')
+      Utils.markStageSkippedForConditional('integration: CPU 2 of 2')
     }
   },
   'unittest: CPU': {
@@ -748,17 +779,16 @@ stage('Test') {
       Utils.markStageSkippedForConditional('python3: i386')
     }
   },
-  'python3: aarch64': {
+  'topi: aarch64': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('ARM') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
-          try {
-            init_git()
-            unpack_lib('arm', tvm_multilib)
-            timeout(time: max_time, unit: 'MINUTES') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            try {
+              init_git()
+              unpack_lib('arm', tvm_multilib)
               ci_setup(ci_arm)
               cpp_unittest(ci_arm)
-              python_unittest(ci_arm)
               sh (
                 script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
                 label: 'Run test_arm_compute_lib test',
@@ -767,10 +797,34 @@ stage('Test') {
                 script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
                 label: 'Run TOPI tests',
               )
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
+            } finally {
+              junit 'build/pytest-results/*.xml'
+            }
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('topi: aarch64')
+    }
+  },
+  'integration: aarch64 1 of 2': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('ARM') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=0'], {
+                unpack_lib('arm', tvm_multilib)
+                ci_setup(ci_arm)
+                python_unittest(ci_arm)
+                sh (
+                  script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                  label: 'Run CPU integration tests',
+                )
+              })
             }
           } finally {
             junit 'build/pytest-results/*.xml'
@@ -778,22 +832,54 @@ stage('Test') {
         }
       }
     } else {
-      Utils.markStageSkippedForConditional('python3: arm')
+      Utils.markStageSkippedForConditional('integration: aarch64 1 of 2')
     }
   },
-  'topi: GPU': {
+  'integration: aarch64 2 of 2': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('ARM') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=1'], {
+                unpack_lib('arm', tvm_multilib)
+                ci_setup(ci_arm)
+                python_unittest(ci_arm)
+                sh (
+                  script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+                  label: 'Run CPU integration tests',
+                )
+              })
+            }
+          } finally {
+            junit 'build/pytest-results/*.xml'
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('integration: aarch64 2 of 2')
+    }
+  },
+  'topi: GPU 1 of 2': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('GPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
           try {
             init_git()
-            unpack_lib('gpu', tvm_multilib)
             timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=0'], {
+                unpack_lib('gpu', tvm_multilib)
+                ci_setup(ci_gpu)
+                sh (
+                  script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                  label: 'Run TOPI tests',
+                )
+              })
             }
           } finally {
             junit 'build/pytest-results/*.xml'
@@ -801,53 +887,115 @@ stage('Test') {
         }
       }
     } else {
-      Utils.markStageSkippedForConditional('topi: GPU')
+      Utils.markStageSkippedForConditional('topi: GPU 1 of 2')
     }
   },
-  'frontend: GPU 1': {
+  'topi: GPU 2 of 2': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('GPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/topi-python-gpu") {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS=2',
+                'TVM_SHARD_INDEX=1'], {
+                unpack_lib('gpu', tvm_multilib)
+                ci_setup(ci_gpu)
+                sh (
+                  script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+                  label: 'Run TOPI tests',
+                )
+              })
+            }
+          } finally {
+            junit 'build/pytest-results/*.xml'
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('topi: GPU 2 of 2')
+    }
+  },
+  'frontend: GPU 1 of 3': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('GPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
           try {
             init_git()
-            unpack_lib('gpu', tvm_multilib)
             timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 1",
-                label: 'Run Python frontend tests (shard 1)',
-              )
+              withEnv([
+                'TVM_NUM_SHARDS=3',
+                'TVM_SHARD_INDEX=0'], {
+                unpack_lib('gpu', tvm_multilib)
+                ci_setup(ci_gpu)
+                sh (
+                  script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                  label: 'Run Python frontend tests',
+                )
+              })
             }
           } finally {
             junit 'build/pytest-results/*.xml'
           }
         }
       }
-     } else {
-      Utils.markStageSkippedForConditional('frontend: GPU 1')
+    } else {
+      Utils.markStageSkippedForConditional('frontend: GPU 1 of 3')
     }
   },
-  'frontend: GPU 2': {
+  'frontend: GPU 2 of 3': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('GPU') {
         ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
           try {
             init_git()
-            unpack_lib('gpu', tvm_multilib)
             timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 2",
-                label: 'Run Python frontend tests (shard 2)',
-              )
+              withEnv([
+                'TVM_NUM_SHARDS=3',
+                'TVM_SHARD_INDEX=1'], {
+                unpack_lib('gpu', tvm_multilib)
+                ci_setup(ci_gpu)
+                sh (
+                  script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                  label: 'Run Python frontend tests',
+                )
+              })
             }
           } finally {
             junit 'build/pytest-results/*.xml'
           }
         }
       }
-     } else {
-      Utils.markStageSkippedForConditional('frontend: GPU 2')
+    } else {
+      Utils.markStageSkippedForConditional('frontend: GPU 2 of 3')
+    }
+  },
+  'frontend: GPU 3 of 3': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('GPU') {
+        ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/frontend-python-gpu") {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS=3',
+                'TVM_SHARD_INDEX=2'], {
+                unpack_lib('gpu', tvm_multilib)
+                ci_setup(ci_gpu)
+                sh (
+                  script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+                  label: 'Run Python frontend tests',
+                )
+              })
+            }
+          } finally {
+            junit 'build/pytest-results/*.xml'
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('frontend: GPU 3 of 3')
     }
   },
   'frontend: CPU': {
diff --git a/conftest.py b/conftest.py
index 28859fd4a17b..9768b6cc528d 100644
--- a/conftest.py
+++ b/conftest.py
@@ -14,5 +14,80 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import hashlib
+import pytest
+import os
+from collections import OrderedDict
 
 pytest_plugins = ["tvm.testing.plugin"]
+
+
+# These are long-running tests (manually curated and extracted from CI logs)
+# that should be allocated to test shards in a round-robin fashion. They are
+# the 20 (an arbitrary number) slowest tests reported at
+# https://ci.tlcpack.ai/job/tvm/job/main/2907/testReport
+_slowest_tests = [
+    "tests/python/frontend/tensorflow/test_forward.py::test_forward_broadcast_args",
+    "tests/python/frontend/tensorflow/test_forward.py::test_forward_broadcast_to",
+    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
+    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
+    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
+    "tests/python/topi/python/test_topi_upsampling.py::test_upsampling3d",
+    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[int8]",
+    "tests/python/frontend/tflite/test_forward.py::test_all_elemwise",
+    "tests/python/frontend/pytorch/test_object_detection.py::test_detection_models",
+    "tests/python/topi/python/test_topi_conv2d_int8.py::test_conv2d_nchw[uint8]",
+    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
+    "tests/python/topi/python/test_topi_conv2d_hwnc_tensorcore.py::test_conv2d_hwnc_tensorcore",
+    "tests/python/contrib/test_tensorrt.py::test_binary[compile]",
+    "tests/python/frontend/pytorch/test_forward.py::test_segmentation_models",
+    "tests/python/topi/python/test_topi_conv2d_NCHWc.py::test_conv2d_NCHWc",
+    "tests/python/relay/test_py_converter.py::test_global_recursion",
+    "tests/python/frontend/tensorflow/test_forward.py::test_forward_ptb",
+    "tests/python/relay/test_op_level6.py::test_topk",
+    "tests/python/topi/python/test_topi_conv2d_winograd.py::test_conv2d_nchw",
+    "tests/python/relay/test_py_converter.py::test_global_recursion",
+]
+HARDCODED_ALLOCATIONS = {}
+for idx, test in enumerate(_slowest_tests):
+    HARDCODED_ALLOCATIONS[test] = idx
+
+# These rely on running on the same node to pass successfully
+FIXED_ALLOCATION_PREFIXES = {
+    "tests/python/unittest/test_tvm_testing_features.py": 0,
+}
+
+
+def should_run(nodeid: str, num_shards: int, shard_index: int) -> bool:
+    """
+    Return true if this test should run on this shard
+    """
+    for prefix, target_shard_idx in FIXED_ALLOCATION_PREFIXES.items():
+        if nodeid.startswith(prefix):
+            if target_shard_idx >= num_shards:
+                raise RuntimeError(
+                    f"Cannot collect sharded tests, {nodeid} has hardcoded shard index {target_shard_idx} among only {num_shards} shards"
+                )
+            return target_shard_idx == shard_index
+
+    if nodeid in HARDCODED_ALLOCATIONS:
+        hash = HARDCODED_ALLOCATIONS[nodeid]
+    else:
+        hash = hashlib.md5(nodeid.encode())
+        hash = int(hash.hexdigest(), 16)
+
+    return hash % num_shards == shard_index
+
+
+def pytest_collection_modifyitems(config, items):
+    if not all(k in os.environ for k in ["CI", "TVM_NUM_SHARDS", "TVM_SHARD_INDEX"]):
+        # Only apportion tests if in CI and in a job that is set up for it
+        return
+
+    num_shards = int(os.environ["TVM_NUM_SHARDS"])
+    shard_index = int(os.environ["TVM_SHARD_INDEX"])
+
+    print(f"Marking tests for shard {shard_index} of {num_shards}")
+    for item in items:
+        if not should_run(item.nodeid, num_shards=num_shards, shard_index=shard_index):
+            item.add_marker(pytest.mark.skip())
diff --git a/jenkins/Jenkinsfile.j2 b/jenkins/Jenkinsfile.j2
index 828f251e7857..1a61d140c3f7 100644
--- a/jenkins/Jenkinsfile.j2
+++ b/jenkins/Jenkinsfile.j2
@@ -85,7 +85,7 @@ tvm_multilib_tsim = 'build/libvta_tsim.so, ' +
 upstream_revision = null
 
 // command to start a docker container
-docker_run = 'docker/bash.sh'
+docker_run = 'docker/bash.sh --env CI --env TVM_SHARD_INDEX --env TVM_NUM_SHARDS'
 docker_build = 'docker/build.sh'
 // timeout in minutes
 max_time = 240
@@ -451,7 +451,7 @@ def fsim_test(image) {
 
 def cmake_build(image, path, make_flag) {
   sh (
-    script: "${docker_run} ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
+    script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_build.py --sccache-bucket tvm-sccache-prod",
     label: 'Run cmake build',
   )
 }
@@ -670,29 +670,14 @@ stage('Test') {
       Utils.markStageSkippedForConditional('unittest: GPU')
     }
   },
-  'integration: CPU': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('CPU') {
-        ws({{ m.per_exec_ws('tvm/ut-python-cpu') }}) {
-          try {
-            init_git()
-            unpack_lib('cpu', tvm_multilib_tsim)
-            timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_cpu)
-              sh (
-                script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            }
-          } finally {
-            junit 'build/pytest-results/*.xml'
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('integration: CPU')
-    }
-  },
+  {% call m.sharded_test_step(name="integration: CPU", node="CPU", num_shards=2, ws="tvm/integration-python-cpu") %}
+    unpack_lib('cpu', tvm_multilib_tsim)
+    ci_setup(ci_cpu)
+    sh (
+      script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh",
+      label: 'Run CPU integration tests',
+    )
+  {% endcall %}
   'unittest: CPU': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU') {
@@ -745,108 +730,44 @@ stage('Test') {
       Utils.markStageSkippedForConditional('python3: i386')
     }
   },
-  'python3: aarch64': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('ARM') {
-        ws({{ m.per_exec_ws('tvm/ut-python-arm') }}) {
-          try {
-            init_git()
-            unpack_lib('arm', tvm_multilib)
-            timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_arm)
-              cpp_unittest(ci_arm)
-              python_unittest(ci_arm)
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
-                label: 'Run test_arm_compute_lib test',
-              )
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-              sh (
-                script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
-                label: 'Run CPU integration tests',
-              )
-            }
-          } finally {
-            junit 'build/pytest-results/*.xml'
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('python3: arm')
-    }
-  },
-  'topi: GPU': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('GPU') {
-        ws({{ m.per_exec_ws('tvm/topi-python-gpu') }}) {
-          try {
-            init_git()
-            unpack_lib('gpu', tvm_multilib)
-            timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
-                label: 'Run TOPI tests',
-              )
-            }
-          } finally {
-            junit 'build/pytest-results/*.xml'
-          }
-        }
-      }
-    } else {
-      Utils.markStageSkippedForConditional('topi: GPU')
-    }
-  },
-  'frontend: GPU 1': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('GPU') {
-        ws({{ m.per_exec_ws('tvm/frontend-python-gpu') }}) {
-          try {
-            init_git()
-            unpack_lib('gpu', tvm_multilib)
-            timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 1",
-                label: 'Run Python frontend tests (shard 1)',
-              )
-            }
-          } finally {
-            junit 'build/pytest-results/*.xml'
-          }
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('frontend: GPU 1')
-    }
-  },
-  'frontend: GPU 2': {
-    if (!skip_ci && is_docs_only_build != 1) {
-      node('GPU') {
-        ws({{ m.per_exec_ws('tvm/frontend-python-gpu') }}) {
-          try {
-            init_git()
-            unpack_lib('gpu', tvm_multilib)
-            timeout(time: max_time, unit: 'MINUTES') {
-              ci_setup(ci_gpu)
-              sh (
-                script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh 2",
-                label: 'Run Python frontend tests (shard 2)',
-              )
-            }
-          } finally {
-            junit 'build/pytest-results/*.xml'
-          }
-        }
-      }
-     } else {
-      Utils.markStageSkippedForConditional('frontend: GPU 2')
-    }
-  },
+  {% call m.test_step(name="topi: aarch64", node="ARM", ws="tvm/ut-python-arm") %}
+    unpack_lib('arm', tvm_multilib)
+    ci_setup(ci_arm)
+    cpp_unittest(ci_arm)
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh",
+      label: 'Run test_arm_compute_lib test',
+    )
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_topi.sh",
+      label: 'Run TOPI tests',
+    )
+  {% endcall %}
+  {% call m.sharded_test_step(name="integration: aarch64", num_shards=2, node="ARM", ws="tvm/ut-python-arm") %}
+    unpack_lib('arm', tvm_multilib)
+    ci_setup(ci_arm)
+    python_unittest(ci_arm)
+    sh (
+      script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh",
+      label: 'Run CPU integration tests',
+    )
+  {% endcall %}
+  {% call m.sharded_test_step(name="topi: GPU", node="GPU", num_shards=2, ws="tvm/topi-python-gpu") %}
+    unpack_lib('gpu', tvm_multilib)
+    ci_setup(ci_gpu)
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh",
+      label: 'Run TOPI tests',
+    )
+  {% endcall %}
+  {% call m.sharded_test_step(name="frontend: GPU", node="GPU", num_shards=3, ws="tvm/frontend-python-gpu") %}
+    unpack_lib('gpu', tvm_multilib)
+    ci_setup(ci_gpu)
+    sh (
+      script: "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh",
+      label: 'Run Python frontend tests',
+    )
+  {% endcall %}
   'frontend: CPU': {
     if (!skip_ci && is_docs_only_build != 1) {
       node('CPU') {
diff --git a/jenkins/macros.j2 b/jenkins/macros.j2
index 7edfb7e9d122..033afbe94921 100644
--- a/jenkins/macros.j2
+++ b/jenkins/macros.j2
@@ -18,3 +18,52 @@
 {% macro per_exec_ws(folder) -%}
   "workspace/exec_${env.EXECUTOR_NUMBER}/{{ folder }}"
 {%- endmacro -%}
+
+{% macro sharded_test_step(name, num_shards, node, ws) %}
+{% for shard_index in range(1, num_shards + 1) %}
+  '{{ name }} {{ shard_index }} of {{ num_shards }}': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('{{ node }}') {
+        ws({{ per_exec_ws(ws) }}) {
+          try {
+            init_git()
+            timeout(time: max_time, unit: 'MINUTES') {
+              withEnv([
+                'TVM_NUM_SHARDS={{ num_shards }}',
+                'TVM_SHARD_INDEX={{ shard_index - 1 }}'], {
+                {{ caller() | trim | indent(width=12) }}
+              })
+            }
+          } finally {
+            junit 'build/pytest-results/*.xml'
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('{{ name }} {{ shard_index }} of {{ num_shards }}')
+    }
+  },
+{% endfor %}
+{% endmacro %}
+
+
+{% macro test_step(name, node, ws) %}
+  '{{ name }}': {
+    if (!skip_ci && is_docs_only_build != 1) {
+      node('{{ node }}') {
+        ws({{ per_exec_ws(ws) }}) {
+          timeout(time: max_time, unit: 'MINUTES') {
+            try {
+              init_git()
+              {{ caller() | indent(width=10) | trim }}
+            } finally {
+              junit 'build/pytest-results/*.xml'
+            }
+          }
+        }
+      }
+    } else {
+      Utils.markStageSkippedForConditional('{{ name }}')
+    }
+  },
+{% endmacro %}
diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh
index d7e1b5113f7c..bbcba37c6d01 100755
--- a/tests/scripts/task_python_frontend.sh
+++ b/tests/scripts/task_python_frontend.sh
@@ -30,53 +30,31 @@ find . -type f -path "*.pyc" | xargs rm -f
 # Rebuild cython
 make cython3
 
-# These tests are sharded into two sections in order to increase parallelism in CI.
-# The split is purely based on balancing the runtime of each shard so they should
-# be about the same. This may need rebalancing in the future if this is no longer
-# the case.
-function shard1 {
-    echo "Running relay MXNet frontend test..."
-    run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet
 
-    echo "Running relay ONNX frontend test..."
-    run_pytest cython python-frontend-onnx tests/python/frontend/onnx
+echo "Running relay MXNet frontend test..."
+run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet
 
-    echo "Running relay PyTorch frontend test..."
-    run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
-}
+echo "Running relay ONNX frontend test..."
+run_pytest cython python-frontend-onnx tests/python/frontend/onnx
 
-function shard2 {
-    echo "Running relay Tensorflow frontend test..."
-    # Note: Tensorflow tests often have memory issues, so invoke each one separately
-    TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
-    i=0
-    for node_id in $TENSORFLOW_TESTS; do
-        echo "$node_id"
-        run_pytest cython "python-frontend-tensorflow-$i" "$node_id"
-        i=$((i+1))
-    done
+echo "Running relay PyTorch frontend test..."
+run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch
 
-    echo "Running relay DarkNet frontend test..."
-    run_pytest cython python-frontend-darknet tests/python/frontend/darknet
+echo "Running relay Tensorflow frontend test..."
+# Note: Tensorflow tests often have memory issues, so invoke each one separately
+TENSORFLOW_TESTS=$(./tests/scripts/pytest_ids.py --folder tests/python/frontend/tensorflow)
+i=0
+for node_id in $TENSORFLOW_TESTS; do
+    echo "$node_id"
+    run_pytest cython "python-frontend-tensorflow-$i" "$node_id"
+    i=$((i+1))
+done
 
-    echo "Running relay PaddlePaddle frontend test..."
-    run_pytest cython python-frontend-paddlepaddle tests/python/frontend/paddlepaddle
+echo "Running relay DarkNet frontend test..."
+run_pytest cython python-frontend-darknet tests/python/frontend/darknet
 
-    echo "Running relay CoreML frontend test..."
-    run_pytest cython python-frontend-coreml tests/python/frontend/coreml
-}
+echo "Running relay PaddlePaddle frontend test..."
+run_pytest cython python-frontend-paddlepaddle tests/python/frontend/paddlepaddle
 
-
-if [ -z ${1+x} ]; then
-    # TODO: This case can be removed once https://github.com/apache/tvm/pull/10413
-    # is merged.
-    # No sharding set, run everything
-    shard1
-    shard2
-else
-    if [ "$1" == "1" ]; then
-        shard1
-    else
-        shard2
-    fi
-fi
+echo "Running relay CoreML frontend test..."
+run_pytest cython python-frontend-coreml tests/python/frontend/coreml