diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c1a75924c6d72..f5b2145c22d6f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -284,7 +284,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", is_driver_worker=True, ) - self._run_workers("init_model", cupy_port=get_open_port()) + # don't use cupy for eager mode + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) self._run_workers( "load_model", max_concurrent_workers=self.parallel_config.