Shut down the model executor when the engine is garbage collected.

* LLMEngine.__del__ calls shutdown() on self.model_executor, looked up
  with getattr because __init__ can fail before the field is set.
* ExecutorBase gains a shutdown() hook whose base implementation just
  returns, and a __del__ that calls it.

NOTE(review): an executor owned by an engine can have shutdown() called
twice (once from each __del__), so overrides presumably need to tolerate
repeated calls; confirm against the concrete executors.

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 56c2417d6a6e6..7de60d738113e 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -287,6 +287,12 @@ def __reduce__(self):
         # the closure used to initialize Ray worker actors
         raise RuntimeError("LLMEngine should not be pickled!")
 
+    def __del__(self):
+        # Shutdown model executor when engine is garbage collected
+        # Use getattr since __init__ can fail before the field is set
+        if model_executor := getattr(self, "model_executor", None):
+            model_executor.shutdown()
+
     def get_tokenizer(self) -> "PreTrainedTokenizer":
         return self.tokenizer.get_lora_tokenizer(None)
 
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 1839b5603ff3e..1838c34be2fda 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -95,6 +95,13 @@ def check_health(self) -> None:
         exception."""
         raise NotImplementedError
 
+    def shutdown(self) -> None:
+        """Shutdown the executor."""
+        return
+
+    def __del__(self):
+        self.shutdown()
+
 
 class ExecutorAsyncBase(ExecutorBase):
 