From b089ec623188239b5e4c7c247eea20cf78b02454 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 26 Jun 2024 00:15:22 -0700
Subject: [PATCH] [Bugfix] Fix embedding to support 2D inputs (#5829)

---
 vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 1a26c5c63fedc..4650b2c2458d0 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -306,11 +306,11 @@ def forward(self, input_):
                 self.shard_indices.added_vocab_end_index)
         else:
             masked_input = input_
-            # Get the embeddings.
+        # Get the embeddings.
         output_parallel = F.embedding(masked_input.long(), self.weight)
         # Mask the output embedding.
         if self.tp_size > 1:
-            output_parallel.masked_fill_(input_mask.unsqueeze(1), 0)
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
         # Reduce across all the model parallel GPUs.
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output
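
For context (not part of the patch itself): a minimal, standalone sketch of why switching unsqueeze(1) to unsqueeze(-1) is what makes 2D token-ID inputs work. The toy vocab_size, hidden_size, all-True mask, and tensor values below are illustrative stand-ins, not taken from vLLM; only the shapes and the masked_fill_ pattern mirror the patched forward().

import torch
import torch.nn.functional as F

vocab_size, hidden_size = 8, 4
weight = torch.randn(vocab_size, hidden_size)

# 1D input: [num_tokens]        -> embedding output [num_tokens, hidden_size]
# 2D input: [batch, seq_len]    -> embedding output [batch, seq_len, hidden_size]
input_1d = torch.tensor([1, 5, 7])
input_2d = torch.tensor([[1, 5], [7, 2]])

for input_ in (input_1d, input_2d):
    # Pretend every token falls outside this shard, so the mask is all True.
    input_mask = torch.ones_like(input_, dtype=torch.bool)
    output_parallel = F.embedding(input_.long(), weight)

    # unsqueeze(-1) appends a trailing size-1 dim, so the mask broadcasts over
    # hidden_size for both 1D ([N, 1]) and 2D ([B, S, 1]) inputs.
    masked = output_parallel.clone()
    masked.masked_fill_(input_mask.unsqueeze(-1), 0)
    assert torch.all(masked == 0)

# The old unsqueeze(1) turns a [B, S] mask into [B, 1, S], which does not line
# up with the [B, S, H] output: it either raises or mis-broadcasts.
input_mask_2d = torch.ones_like(input_2d, dtype=torch.bool)
output_2d = F.embedding(input_2d.long(), weight)
try:
    output_2d.clone().masked_fill_(input_mask_2d.unsqueeze(1), 0)
except RuntimeError as e:
    print("unsqueeze(1) fails for 2D inputs:", e)

For the original 1D case the two calls are equivalent (unsqueeze(1) and unsqueeze(-1) both produce [N, 1]), which is why the bug only surfaced once 2D inputs were passed in.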