From 2b9b5ddc7b4e1fdb50463ebd2e44d534719d4f1e Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Sun, 17 Mar 2024 22:57:01 -0700
Subject: [PATCH 1/3] print error before deadlock

---
 vllm/engine/ray_utils.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index 742f3dc575190..ff7dbf31c21cb 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -33,8 +33,16 @@ def __getattr__(self, name):
             return getattr(self.worker, name)
 
         def execute_method(self, method, *args, **kwargs):
-            executor = getattr(self, method)
-            return executor(*args, **kwargs)
+            try:
+                executor = getattr(self, method)
+                return executor(*args, **kwargs)
+            except Exception as e:
+                # exceptions in ray worker may cause deadlock
+                # see https://github.com/vllm-project/vllm/issues/3455
+                # print the error and inform the user to solve the error
+                print(f"Error executing method {method}:\n{e!r}")
+                print("This might cause deadlock in distributed execution.")
+                raise e
 
         def get_node_ip(self) -> str:
             return get_ip()

From f3fec66caefd25ba779a97ba71d2182fe0b47b52 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Mon, 18 Mar 2024 16:13:26 -0700
Subject: [PATCH 2/3] use logger.exception

---
 vllm/engine/ray_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index ff7dbf31c21cb..cccc30e02746f 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -40,8 +40,9 @@ def execute_method(self, method, *args, **kwargs):
                 # exceptions in ray worker may cause deadlock
                 # see https://github.com/vllm-project/vllm/issues/3455
                 # print the error and inform the user to solve the error
-                print(f"Error executing method {method}:\n{e!r}")
-                print("This might cause deadlock in distributed execution.")
+                msg = f"Error executing method {method}."
+                msg += "This might cause deadlock in distributed execution."
+                logger.exception(msg)
                 raise e
 
         def get_node_ip(self) -> str:

From e15396543af8d8975a4d731ffe4fc3799ad34372 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Mon, 18 Mar 2024 19:52:34 -0700
Subject: [PATCH 3/3] fix formatter, avoid runtime concat of string

---
 vllm/engine/ray_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index cccc30e02746f..27414f085b45a 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -40,8 +40,8 @@ def execute_method(self, method, *args, **kwargs):
                 # exceptions in ray worker may cause deadlock
                 # see https://github.com/vllm-project/vllm/issues/3455
                 # print the error and inform the user to solve the error
-                msg = f"Error executing method {method}."
-                msg += "This might cause deadlock in distributed execution."
+                msg = (f"Error executing method {method}. "
+                       "This might cause deadlock in distributed execution.")
                 logger.exception(msg)
                 raise e
 