Commit: combine eager and compile benchmarks
Priya2698 committed Jan 26, 2024
1 parent 480e3fc commit 105fd9e
Showing 3 changed files with 55 additions and 37 deletions.
37 changes: 37 additions & 0 deletions python_benchmarks/conftest.py
@@ -15,6 +15,19 @@ def pytest_addoption(parser):
         default=False,
         help="Disable benchmarking.",
     )
+    parser.addoption(
+        "--benchmark-eager",
+        action="store_true",
+        default=False,
+        help="Benchmarks torch eager mode.",
+    )
+
+    parser.addoption(
+        "--benchmark-torchcompile",
+        action="store_true",
+        default=False,
+        help="Benchmarks torch.compile mode.",
+    )


 @pytest.fixture
@@ -33,3 +46,27 @@ def pytest_make_parametrize_id(val):

 def pytest_benchmark_update_machine_info(config, machine_info):
     machine_info.update(DEVICE_PROPERTIES)
+
+
+def pytest_collection_modifyitems(session, config, items):
+    run_eager = config.getoption("--benchmark-eager")
+    run_torchcompile = config.getoption("--benchmark-torchcompile")
+
+    if not run_eager:
+        skip_eager = pytest.mark.skip(reason="need --benchmark-eager option to run")
+        for item in items:
+            # If the benchmark has compile=False parameter (eager mode), skip it.
+            if (
+                "compile" in item.callspec.params
+                and not item.callspec.params["compile"]
+            ):
+                item.add_marker(skip_eager)
+
+    if not run_torchcompile:
+        skip_torchcompile = pytest.mark.skip(
+            reason="need --benchmark-torchcompile option to run"
+        )
+        for item in items:
+            # If the benchmark has compile=True parameter (torch.compile mode), skip it.
+            if "compile" in item.callspec.params and item.callspec.params["compile"]:
+                item.add_marker(skip_torchcompile)
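
Note (not part of the commit): a minimal sketch of how a benchmark would hook into this collection logic. The hook inspects a parameter literally named "compile" via item.callspec.params, so a baseline test must parametrize over it; the test name, the relu workload, and the use of pytest-benchmark's benchmark fixture instead of this repo's run_benchmark helper are all hypothetical illustration. With this in place, pytest --benchmark-eager selects the compile=False variants and pytest --benchmark-torchcompile selects the compile=True variants.

import pytest
import torch


# Hypothetical example, not from the repository: the "compile" parameter is what
# pytest_collection_modifyitems looks up in item.callspec.params.
@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
def test_my_op_baseline_benchmark(benchmark, compile: bool):
    def fn(x):
        return torch.nn.functional.relu(x)

    x = torch.randn(128, 768, device="cuda")
    # pytest-benchmark fixture: time either the eager function or its
    # torch.compile'd counterpart, depending on the parameter.
    benchmark(torch.compile(fn) if compile else fn, x)
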
29 changes: 9 additions & 20 deletions python_benchmarks/test_softmax_bwd.py
@@ -54,7 +54,7 @@ def softmax_bwd_fusion(
     fd.add_output(T19)


-def softmax_bwd_fn(inputs: list):  # [in_tensor, output, grads]
+def unary_bwd_torch(inputs: list):  # [in_tensor, output, grads]
     inputs[1].backward(inputs[2], retain_graph=True)
     return inputs[0].grad

@@ -89,34 +89,23 @@ def test_softmax_bwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


+@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
 @pytest.mark.parametrize("size", generate_input_sizes(dims=2))
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
-def test_softmax_bwd_eager_benchmark(
-    benchmark,
-    size: tuple,
-    dtype: torch.dtype,
-    reduction_axis: int,
-):
-    clear_cuda_cache()
-    input = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
-    grads = torch.randn(*size, device="cuda", dtype=dtype)
-
-    output = torch.nn.functional.softmax(input, dim=reduction_axis)
-    run_benchmark(benchmark, softmax_bwd_fn, [input, output, grads])
-
-
-@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
-@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-@pytest.mark.parametrize("reduction_axis", [0, 1])
-def test_softmax_bwd_compile_benchmark(
+def test_softmax_bwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
+    compile: bool,
 ):
     clear_cuda_cache()
     input = torch.randn(*size, device="cuda", dtype=dtype, requires_grad=True)
     grads = torch.randn(*size, device="cuda", dtype=dtype)
     output = torch.nn.functional.softmax(input, dim=reduction_axis)
-    run_benchmark(benchmark, torch.compile(softmax_bwd_fn), [input, output, grads])
+    run_benchmark(
+        benchmark,
+        torch.compile(unary_bwd_torch) if compile else unary_bwd_torch,
+        [input, output, grads],
+    )
26 changes: 9 additions & 17 deletions python_benchmarks/test_softmax_fwd.py
@@ -74,29 +74,21 @@ def test_softmax_fwd_nvf_benchmark(
     run_benchmark(benchmark, fd.execute, inputs)


-@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
-@pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-@pytest.mark.parametrize("reduction_axis", [0, 1])
-def test_softmax_fwd_eager_benchmark(
-    benchmark,
-    size: tuple,
-    dtype: torch.dtype,
-    reduction_axis: int,
-):
-    clear_cuda_cache()
-    input = torch.randn(*size, device="cuda", dtype=dtype)
-    run_benchmark(benchmark, softmax_fwd_fn, [input, reduction_axis])
-
-
-@pytest.mark.parametrize("size", generate_input_sizes(dims=2))
+@pytest.mark.parametrize("compile", [False, True], ids=["eager", "compile"])
+@pytest.mark.parametrize("size", [(128, 768)])
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
 @pytest.mark.parametrize("reduction_axis", [0, 1])
-def test_softmax_fwd_compile_benchmark(
+def test_softmax_fwd_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
     reduction_axis: int,
+    compile: bool,
 ):
     clear_cuda_cache()
     input = torch.randn(*size, device="cuda", dtype=dtype)
-    run_benchmark(benchmark, torch.compile(softmax_fwd_fn), [input, reduction_axis])
+    run_benchmark(
+        benchmark,
+        torch.compile(softmax_fwd_fn) if compile else softmax_fwd_fn,
+        [input, reduction_axis],
+    )
