[UnitTest][NVPTX] Avoid cascading failures from CUDA postproc (apache#15136)

Prior to this commit, the tests in
`test_tir_transform_inject_ptx_async_copy.py` registered the
`"tvm_callback_cuda_postproc"` function during pytest collection, and
used a global variable to disable its functionality outside of the
tests in this file.  This had two major issues.  First, if any other
test also installs a postproc function, the postproc function
required by the NVPTX tests would be overwritten.  Second, if one of
the NVPTX tests fails, the global variable controlling the postproc
function would not be reset, causing any subsequent CUDA-related tests
to also fail.
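
For illustration, a minimal sketch of the fragile pattern (hypothetical
callback bodies; the real code appears in the diff below). TVM keeps a
single process-wide function registry, so a later registration with
`override=True` silently replaces whatever another test installed under
the same name:

```python
import tvm

# Installed by some other test file:
@tvm.register_func("tvm_callback_cuda_postproc", override=True)
def postproc_a(code, target):
    return code

# Installed by this test file during pytest collection; postproc_a is
# now gone for the rest of the test session:
@tvm.register_func("tvm_callback_cuda_postproc", override=True)
def postproc_b(code, target):
    return code
```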

This commit updates these NVPTX tests to conditionally install the
postproc function, to de-register it after the test instead of
disabling its functionality, and to de-register it regardless of the
test result.
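
The "regardless of the test result" guarantee comes from pytest's fixture
semantics: code after a fixture's `yield` runs at teardown even when the
test body raises. A minimal, self-contained sketch of that behavior
(illustrative names, not from the commit):

```python
import pytest

registry = {}

@pytest.fixture
def temporary_entry():
    registry["key"] = "value"  # setup, before the test body
    yield
    registry.pop("key", None)  # teardown, runs even if the test failed

def test_entry_present(temporary_entry):
    assert registry["key"] == "value"
```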

This issue was initially found when debugging
apache#15103, when a failure in
`test_tir_transform_inject_ptx_async_copy.py::test_cp_async_in_if_then_else`
caused failures in 32 unrelated tests ([CI
link](https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-gpu/detail/PR-15103/7/tests)).
Lunderberg authored Jul 5, 2023
1 parent c928852 commit 0bb390b
Showing 1 changed file with 51 additions and 46 deletions.
97 changes: 51 additions & 46 deletions tests/python/unittest/test_tir_transform_inject_ptx_async_copy.py
@@ -14,11 +14,14 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import numpy as np

import tvm
import tvm.testing
from tvm.script import tir as T

import pytest
import numpy as np


def count_cp_async(stmt):
num_alloc = [0]
@@ -351,36 +354,54 @@ def test_inject_async_copy_shared_dyn():
"""


generated_code = ""
support_async = True
@pytest.fixture
def postproc_if_missing_async_support():
arch = tvm.contrib.nvcc.get_target_compute_version()
major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
support_async = major >= 8

func_name = "tvm_callback_cuda_postproc"
prev_postproc = tvm.get_global_func(func_name, allow_missing=True)

# Store the generated code prior to the post-processing. This
# way, even though the generated code doesn't compile on platforms
# that do not support async, the comparison against an expected
# output can still be performed. We cannot use
# `mod.get_source()`, as that contains the source after all
# post-processing.
original_code = None

def get_original_code():
nonlocal original_code
return original_code

@tvm.register_func(func_name, override=True)
def tvm_callback_cuda_postproc(code, _):
nonlocal original_code
original_code = code
if support_async:
return code
else:
ret = []
for line in code.split("\n"):
ret.append(line)
ret.append("\n")
if line.startswith('extern "C" __global__') and line.endswith("{"):
break
ret.append("}")
return "".join(ret)

yield get_original_code

@tvm.register_func
def tvm_callback_cuda_postproc(code, _):
global generated_code
global support_async
generated_code = code
# return a dummy code so that device < sm80 could build correctly
if not support_async:
ret = ""
for line in code.split("\n"):
ret += line + "\n"
if line.startswith('extern "C" __global__'):
break
ret += "}"
return ret
return code
# Restore previous postproc func to avoid impacting other tests
if prev_postproc is None:
tvm._ffi.registry.remove_global_func(func_name)
else:
tvm.register_func(func_name, prev_postproc, override=True)


@tvm.testing.requires_cuda
def test_cp_async_in_if_then_else():
global support_async
arch = tvm.contrib.nvcc.get_target_compute_version()
major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
if major < 8:
# At least sm80 is required
support_async = False

def test_cp_async_in_if_then_else(postproc_if_missing_async_support):
@T.prim_func
def simple_compute(
A: T.Buffer((16, 14), "float32"),
@@ -422,22 +443,12 @@ def simple_compute(
mod = tvm.IRModule.from_expr(simple_compute)
with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
tvm.build(mod, target="cuda")
generated_code = postproc_if_missing_async_support()
assert generated_code == expected_cuda_script

if not support_async:
# avoid return dummy code to other tests
support_async = True


@tvm.testing.requires_cuda
def test_vectorize_cp_async_in_if_then_else():
global support_async
arch = tvm.contrib.nvcc.get_target_compute_version()
major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
if major < 8:
# At least sm80 is required
support_async = False

def test_vectorize_cp_async_in_if_then_else(postproc_if_missing_async_support):
@T.prim_func
def complex_compute(
A: T.Buffer((2, 16, 16, 1280), "float16"),
@@ -887,16 +898,10 @@ def complex_compute(
mod = tvm.IRModule.from_expr(complex_compute)
with tvm.transform.PassContext(config={"tir.use_async_copy": 1}):
tvm.build(mod, target="cuda")
generated_code = postproc_if_missing_async_support()
# generated_code must contain " setp.ne.b32 p, %0, 0;"
assert "setp.ne.b32" in generated_code

if not support_async:
# avoid return dummy code to other tests
support_async = True


if __name__ == "__main__":
test_inject_async_copy()
test_inject_async_copy_shared_dyn()
test_cp_async_in_if_then_else()
test_vectorize_cp_async_in_if_then_else()
tvm.testing.main()

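For reference, the fixture's save-and-restore logic in the diff above is
equivalent to this try/finally pattern, built from the same TVM registry
calls (a sketch with a placeholder build step, not part of the commit):

```python
import tvm

func_name = "tvm_callback_cuda_postproc"
prev_postproc = tvm.get_global_func(func_name, allow_missing=True)

@tvm.register_func(func_name, override=True)
def _postproc(code, _):
    return code  # identity post-processing, for illustration only

try:
    pass  # build CUDA modules here
finally:
    # Restore the previous registration (or remove ours) no matter what.
    if prev_postproc is None:
        tvm._ffi.registry.remove_global_func(func_name)
    else:
        tvm.register_func(func_name, prev_postproc, override=True)
```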