Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
[Core] Fix engine-use-ray broken (vllm-project#4105)
Browse files: browse the repository at this point in the history
  • Loading branch information
rkooo567 authored and robertgshaw2-neuralmagic committed Apr 21, 2024
1 parent 180b6f9 commit ead1e24
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 8 deletions.
17 changes: 13 additions & 4 deletions tests/async_engine/test_api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:


@pytest.fixture
def api_server(tokenizer_pool_size: int):
def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
worker_use_ray: bool):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
commands = [
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m", "--host",
"127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size)
])
]
if engine_use_ray:
commands.append("--engine-use-ray")
if worker_use_ray:
commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands)
yield
uvicorn_process.terminate()


@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
def test_api_server(api_server, tokenizer_pool_size: int):
@pytest.mark.parametrize("worker_use_ray", [False, True])
@pytest.mark.parametrize("engine_use_ray", [False, True])
def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
engine_use_ray: bool):
"""
Run the API server and test it.
Expand Down
7 changes: 3 additions & 4 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,7 @@ def from_engine_args(
if engine_config.device_config.device_type == "neuron":
raise NotImplementedError("Neuron is not supported for "
"async engine yet.")
elif (engine_config.parallel_config.worker_use_ray
or engine_args.engine_use_ray):
elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
executor_class = RayGPUExecutorAsync
Expand Down Expand Up @@ -410,8 +409,8 @@ def _init_engine(self, *args,
else:
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
# order of the arguments.
cache_config = args[1]
parallel_config = args[2]
cache_config = kwargs["cache_config"]
parallel_config = kwargs["parallel_config"]
if parallel_config.tensor_parallel_size == 1:
num_gpus = cache_config.gpu_memory_utilization
else:
Expand Down

0 comments on commit ead1e24

Please sign in to comment.