[Bugfix] Fixes for Phi3v and Ultravox Multimodal EmbeddingInputs Support #8979
```diff
@@ -119,10 +119,13 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         data = [data]
 
     audio_features = []
+    audio_embeds = []
+    is_audio_embeds = False
     for audio_input in data:
         if not isinstance(audio_input, tuple):
-            raise NotImplementedError(
-                f"Unsupported data type: {type(audio_input)}")
+            is_audio_embeds = True
+            audio_embeds.append(audio_input)
+            continue
 
         (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], audio_input)
         feature_extractor = whisper_feature_extractor(ctx)
```

Review thread on the new `audio_embeds` handling:

**Reviewer:** I think it'd be simpler to do an early return before doing any audio processing:

```python
if is_list_of(data, torch.Tensor, check="all"):
    return MultiModalInputs({"audio_embeds": data})
```

**Author:** True!

**Author:** Made the change and added a comment in the code!
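For reference, a rough sketch of how the mapper might look once the suggested early return is applied. This is an illustration rather than the merged code; it assumes vLLM's `InputContext`, `MultiModalInputs`, and `is_list_of` helpers are in scope, as they are in `ultravox.py`, and the elided body is the existing feature-extraction path.

```python
def input_mapper_for_ultravox(ctx: InputContext, data: object):
    if not isinstance(data, list):
        data = [data]

    # Pre-computed audio embeddings: skip the Whisper feature extractor and
    # pass the tensors straight through under the "audio_embeds" key.
    if is_list_of(data, torch.Tensor, check="all"):
        return MultiModalInputs({"audio_embeds": data})

    audio_features = []
    for audio_input in data:
        if not isinstance(audio_input, tuple):
            raise NotImplementedError(
                f"Unsupported data type: {type(audio_input)}")
        ...  # existing (waveform, sampling_rate) feature-extraction path

    return MultiModalInputs({"audio_features": audio_features})
```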
```diff
@@ -150,7 +153,8 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
         # Remove the batch dimension because we're wrapping it in a list.
         audio_features.append(single_audio_features.squeeze(0))
 
-    return MultiModalInputs({"audio_features": audio_features})
+    return (MultiModalInputs({"audio_embeds": audio_embeds}) if is_audio_embeds
+            else MultiModalInputs({"audio_features": audio_features}))
 
 
 def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs):
```
```diff
@@ -164,25 +168,30 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs):
         audios = [audios]
 
     audio_token_counts = []
-    for audio_data, sample_rate in audios:
-        audio_length = audio_data.shape[0]
-        if sample_rate != feature_extractor.sampling_rate:
-            # Account for resampling.
-            adjustment = feature_extractor.sampling_rate / sample_rate
-            audio_length = math.ceil(adjustment * audio_length)
-
-        feature_extractor_output_length = math.ceil(
-            (audio_length - (feature_extractor.hop_length - 1)) /
-            feature_extractor.hop_length)
-
-        uv_config = ctx.get_hf_config(UltravoxConfig)
-        audio_num_tokens = min(
-            max(
-                1,
-                math.ceil(feature_extractor_output_length /
-                          (uv_config.stack_factor * 2))),
-            get_ultravox_max_audio_tokens(ctx))
-        audio_token_counts.append(audio_num_tokens)
+    for audio in audios:
+        if isinstance(audio, torch.Tensor):
+            audio_num_tokens = audio.shape[1]
+            audio_token_counts.append(audio_num_tokens)
+        else:
+            audio_data, sample_rate = audio
+            audio_length = audio_data.shape[0]
+            if sample_rate != feature_extractor.sampling_rate:
+                # Account for resampling.
+                adjustment = feature_extractor.sampling_rate / sample_rate
+                audio_length = math.ceil(adjustment * audio_length)
+
+            feature_extractor_output_length = math.ceil(
+                (audio_length - (feature_extractor.hop_length - 1)) /
+                feature_extractor.hop_length)
+
+            uv_config = ctx.get_hf_config(UltravoxConfig)
+            audio_num_tokens = min(
+                max(
+                    1,
+                    math.ceil(feature_extractor_output_length /
+                              (uv_config.stack_factor * 2))),
+                get_ultravox_max_audio_tokens(ctx))
+            audio_token_counts.append(audio_num_tokens)
 
     tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
```
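To make the raw-audio branch concrete, here is a small worked example of the token-count math with made-up numbers (2 s of 8 kHz audio, a Whisper-style 16 kHz feature extractor with `hop_length=160`; the `stack_factor` of 8 and the 200-token cap are assumed values, not taken from this PR):

```python
import math

sample_rate, audio_length = 8_000, 16_000     # 2 s of 8 kHz input audio
sampling_rate, hop_length = 16_000, 160       # feature-extractor settings
stack_factor, max_audio_tokens = 8, 200       # assumed config / token cap

# Account for resampling: 16_000 * (16_000 / 8_000) = 32_000 samples.
if sample_rate != sampling_rate:
    audio_length = math.ceil(sampling_rate / sample_rate * audio_length)

# Feature-extractor output frames: ceil((32_000 - 159) / 160) = 200.
frames = math.ceil((audio_length - (hop_length - 1)) / hop_length)

# Audio tokens after stacking: ceil(200 / (8 * 2)) = 13, clamped to the cap.
audio_num_tokens = min(max(1, math.ceil(frames / (stack_factor * 2))),
                       max_audio_tokens)
print(audio_num_tokens)  # 13
```

For the embedding branch, a pre-computed `(batch, seq_len, hidden)` tensor simply contributes its sequence length, i.e. `audio.shape[1]`.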
Review thread on the image-embedding (Phi3v) side of the change:

**Reviewer:** Is this necessary?

**Author:** Yes -- with the original code, with multiple images, it's a 3D tensor when we want a list of 2D tensors. This transforms it to a list of 2D tensors.
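For illustration only (not the exact code in this PR), splitting a batched 3D embedding tensor into a list of per-image 2D tensors can be done with `torch.unbind`; the shapes below are made up:

```python
import torch

# Stacked image embeddings for 3 images: (num_images, seq_len, hidden).
batched = torch.randn(3, 1921, 3072)

# Split along the batch dimension into a list of 2D (seq_len, hidden) tensors.
per_image = list(torch.unbind(batched, dim=0))
assert len(per_image) == 3 and per_image[0].shape == (1921, 3072)
```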
**Reviewer:** What if `image_input["data"]` is already a list? (This may happen when the feature size of each image is different.)

**Author:** That's fair; the images I use for testing all have the same feature size, so it seems they somehow got concatenated together into a tensor. I added a case to catch that and follow the original functionality: https://github.com/vllm-project/vllm/pull/8979/files#diff-be2abd8a08916397663c17c0ecb4036d478e4cc5f770b2f270b121d11414f080R618. I suspect that in that case some sort of error may occur here: https://github.com/vllm-project/vllm/pull/8979/files#diff-be2abd8a08916397663c17c0ecb4036d478e4cc5f770b2f270b121d11414f080L602, since it expects a torch Tensor and not a list.
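A hedged sketch of the kind of dispatch being described; `flatten_image_embeds` is a hypothetical name used only for illustration, not the helper at the linked diff:

```python
from typing import List, Union

import torch


def flatten_image_embeds(
        data: Union[torch.Tensor, List[torch.Tensor]]) -> List[torch.Tensor]:
    """Illustrative helper: always produce a list of 2D per-image tensors."""
    if isinstance(data, list):
        # Already per-image tensors, e.g. when images have different
        # feature sizes and could not be stacked into one tensor.
        return data
    if data.dim() == 3:
        # A stacked (num_images, seq_len, hidden) tensor: split it up.
        return list(torch.unbind(data, dim=0))
    # A single image's (seq_len, hidden) embedding.
    return [data]
```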