diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 7053075bf4d8f..d0b8fec48d74f 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -66,6 +66,10 @@ def execute_model( hidden_states = model_executable(**execute_model_kwargs) + # Only perform pooling in the driver worker. + if not self.is_driver_worker: + return [] + return [ self.model.pooler(hidden_states=hidden_states, pooling_metadata=model_input.pooling_metadata)