Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Optional installation of audio related packages #8063

Merged
merged 4 commits into from
Sep 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
mistral_common >= 1.3.4
pyyaml
pyyaml
4 changes: 3 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ pytest-shard
awscli
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio test
peft
requests
ray
sentence-transformers # required for embedding
soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
Expand All @@ -30,4 +32,4 @@ aiohttp

# quantization
bitsandbytes==0.42.0
buildkite-test-collector==0.1.8
buildkite-test-collector==0.1.8
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ def _read_requirements(filename: str) -> List[str]:
ext_modules=ext_modules,
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"audio": ["librosa", "soundfile"] # Required for audio processing
},
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
package_data=package_data,
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_ultravox.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from typing import List, Optional, Tuple, Type

import librosa
import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from vllm.assets.audio import AudioAsset
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

Expand All @@ -21,6 +19,7 @@

@pytest.fixture(scope="session")
def audio_and_sample_rate():
    """Session-scoped fixture for the ``mary_had_lamb`` audio asset.

    Returns the ``audio_and_sample_rate`` attribute of
    ``AudioAsset("mary_had_lamb")``.
    """
    # Imported inside the fixture body rather than at module level, so the
    # import only runs when a test actually requests this fixture.
    from vllm.assets.audio import AudioAsset

    asset = AudioAsset("mary_had_lamb")
    return asset.audio_and_sample_rate


Expand Down Expand Up @@ -109,6 +108,7 @@ def process(hf_inputs: BatchEncoding):
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
import librosa

hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit(
Expand Down
6 changes: 5 additions & 1 deletion vllm/model_executor/models/ultravox.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union, cast)

import librosa
import numpy as np
import torch
import torch.utils.checkpoint
Expand Down Expand Up @@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
feature_extractor = whisper_feature_extractor(ctx)

if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
Expand Down
20 changes: 17 additions & 3 deletions vllm/multimodal/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import base64
from functools import lru_cache
from io import BytesIO
from typing import List, Optional, Tuple, TypeVar, Union
from typing import Any, List, Optional, Tuple, TypeVar, Union

import librosa
import numpy as np
import soundfile
from PIL import Image

from vllm.connections import global_http_connection
Expand Down Expand Up @@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str,
return image.convert(image_mode)


def try_import_audio_packages() -> Tuple[Any, Any]:
    """Import the optional audio dependencies on demand.

    Returns:
        A ``(librosa, soundfile)`` tuple of the imported modules.

    Raises:
        ImportError: If either package cannot be imported. The message
            points at the ``vllm[audio]`` extra, and the original import
            failure is attached as ``__cause__``.
    """
    try:
        import librosa
        import soundfile
    except ImportError as exc:
        # Chain the original error instead of suppressing it: a package may
        # be installed yet fail to import because of a broken transitive
        # dependency, and hiding that makes the install hint misleading.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc
    return librosa, soundfile


def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
"""
Load audio from a URL.
"""
librosa, _ = try_import_audio_packages()

if audio_url.startswith("http"):
audio_bytes = global_http_connection.get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
Expand All @@ -95,6 +105,8 @@ async def async_fetch_audio(
"""
Asynchronously fetch audio from a URL.
"""
librosa, _ = try_import_audio_packages()

if audio_url.startswith("http"):
audio_bytes = await global_http_connection.async_get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
Expand Down Expand Up @@ -123,6 +135,8 @@ def encode_audio_base64(
sampling_rate: int,
) -> str:
"""Encode audio as base64."""
_, soundfile = try_import_audio_packages()

buffered = BytesIO()
soundfile.write(buffered, audio, sampling_rate, format="WAV")

Expand Down
Loading