diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 43905082b7caf..5452ce6be8110 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -6,7 +6,8 @@
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
 def test_full_graph(model):
     # make sure these models can be captured in full graph mode
-    os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
+    if "VLLM_TEST_DYNAMO_GRAPH_CAPTURE" not in os.environ:
+        os.environ["VLLM_TEST_DYNAMO_GRAPH_CAPTURE"] = "1"
 
     from vllm import LLM, SamplingParams
     prompts = [
diff --git a/vllm/envs.py b/vllm/envs.py
index b3678399fe207..2003ede95d2d8 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -202,6 +202,11 @@ def get_default_config_root():
     (os.environ.get("VLLM_DYNAMO_USE_CUSTOM_DISPATCHER", "True").lower() in
      ("true", "1")),
 
+    # Internal flag to control whether we use custom op,
+    # or use the native pytorch implementation
+    "VLLM_TEST_COMPILE_NO_CUSTOM_OPS":
+    lambda: int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")),
+
     # Internal flag to enable Dynamo fullgraph capture
     "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE":
     lambda: bool(
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 49247cd5de42a..9102b5e19ebec 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -1,5 +1,6 @@
 import torch.nn as nn
 
+import vllm.envs as envs
 from vllm.platforms import current_platform
 from vllm.utils import is_cpu, is_hip, is_xpu
 
@@ -53,6 +54,10 @@ def forward_gaudi(self, *args, **kwargs):
     def dispatch_forward(self):
         # NOTE(woosuk): Here we assume that vLLM was built for only one
         # specific backend. Currently, we do not support dynamic dispatching.
+
+        if envs.VLLM_TEST_COMPILE_NO_CUSTOM_OPS:
+            return self.forward_native
+
         if is_hip():
             return self.forward_hip
         elif is_cpu():
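
Note for reviewers: below is a minimal standalone sketch (not vLLM code; the class and method names are illustrative only) of the dispatch pattern this diff adds — an internal env flag that short-circuits backend-specific dispatch and returns the native PyTorch implementation, which Dynamo can trace into a full graph, instead of an opaque custom op:

import os


class ToyOp:
    # Illustrative stand-in for vllm.model_executor.custom_op.CustomOp.

    def forward_native(self, x):
        # Pure-PyTorch path; fully traceable by torch.compile/Dynamo.
        return x * 2

    def forward_cuda(self, x):
        # Stand-in for a custom CUDA kernel, which Dynamo treats as opaque.
        return x * 2

    def dispatch_forward(self):
        # Mirrors the new guard added to CustomOp.dispatch_forward: when the
        # test flag is set, always fall back to the native implementation.
        if int(os.environ.get("VLLM_TEST_COMPILE_NO_CUSTOM_OPS", "0")):
            return self.forward_native
        return self.forward_cuda

Because the test change above only sets VLLM_TEST_DYNAMO_GRAPH_CAPTURE when it is not already present, both flags can now be supplied from the environment; something along the lines of `VLLM_TEST_COMPILE_NO_CUSTOM_OPS=1 pytest tests/compile/test_full_graph.py` (a hypothetical invocation, not taken from this PR) should exercise the native-op path during graph capture.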