Skip to content

Commit

Permalink
Smoke test for cuda runtime errors (#1315)
Browse files Browse the repository at this point in the history
* Add test for cuda runtime errors

* Add cuda exception smoke test

* Move cuda runtime error to end

* Move cuda runtime error to end

* Address comments

* Address comments
  • Loading branch information
atalman authored Mar 6, 2023
1 parent 18c5017 commit 5276fae
Showing 1 changed file with 29 additions and 10 deletions.
39 changes: 29 additions & 10 deletions test/smoke_test/smoke_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,23 @@ def check_nightly_binaries_date(package: str) -> None:
f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}"
)

def test_cuda_runtime_errors_captured() -> None:
cuda_exception_missed=True
try:
torch._assert_async(torch.tensor(0, device="cuda"))
torch._assert_async(torch.tensor(0 + 0j, device="cuda"))
except RuntimeError as e:
if re.search("CUDA", f"{e}"):
print(f"Caught CUDA exception with success: {e}")
cuda_exception_missed = False
else:
raise e
if(cuda_exception_missed):
raise RuntimeError( f"Expected CUDA RuntimeError but have not received!")

def smoke_test_cuda(package: str) -> None:
if not torch.cuda.is_available() and is_cuda_system:
raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.")
if torch.cuda.is_available():
if torch.version.cuda != gpu_arch_ver:
raise RuntimeError(
f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
)
print(f"torch cuda: {torch.version.cuda}")
# todo add cudnn version validation
print(f"torch cudnn: {torch.backends.cudnn.version()}")
print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")

if(package == 'all' and is_cuda_system):
for module in MODULES:
Expand All @@ -80,6 +85,19 @@ def smoke_test_cuda(package: str) -> None:
version = imported_module._extension._check_cuda_version()
print(f"{module['name']} CUDA: {version}")

if torch.cuda.is_available():
if torch.version.cuda != gpu_arch_ver:
raise RuntimeError(
f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
)
print(f"torch cuda: {torch.version.cuda}")
# todo add cudnn version validation
print(f"torch cudnn: {torch.backends.cudnn.version()}")
print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")

# This check has to be run last, since its messing up CUDA runtime
test_cuda_runtime_errors_captured()


def smoke_test_conv2d() -> None:
import torch.nn as nn
Expand Down Expand Up @@ -128,7 +146,6 @@ def main() -> None:
)
options = parser.parse_args()
print(f"torch: {torch.__version__}")
smoke_test_cuda(options.package)
smoke_test_conv2d()

if options.package == "all":
Expand All @@ -138,6 +155,8 @@ def main() -> None:
if installation_str.find("nightly") != -1:
check_nightly_binaries_date(options.package)

smoke_test_cuda(options.package)


if __name__ == "__main__":
main()

0 comments on commit 5276fae

Please sign in to comment.