Skip to content

Commit

Permalink
[Misc] Optional installation of audio related packages (vllm-project#…
Browse files Browse the repository at this point in the history
  • Loading branch information
ywang96 authored and siddharth9820 committed Sep 30, 2024
1 parent ab01d09 commit 26c4ec0
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 10 deletions.
4 changes: 1 addition & 3 deletions requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@ typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
mistral_common >= 1.3.4
pyyaml
pyyaml
4 changes: 3 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ pytest-shard
awscli
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio test
peft
requests
ray
sentence-transformers # required for embedding
soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
Expand All @@ -30,4 +32,4 @@ aiohttp

# quantization
bitsandbytes==0.42.0
buildkite-test-collector==0.1.8
buildkite-test-collector==0.1.8
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ def _read_requirements(filename: str) -> List[str]:
ext_modules=ext_modules,
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"audio": ["librosa", "soundfile"] # Required for audio processing
},
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
package_data=package_data,
Expand Down
4 changes: 2 additions & 2 deletions tests/models/test_ultravox.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from typing import List, Optional, Tuple, Type

import librosa
import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding

from vllm.assets.audio import AudioAsset
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

Expand All @@ -21,6 +19,7 @@

@pytest.fixture(scope="session")
def audio_and_sample_rate():
    """Return the ``audio_and_sample_rate`` value of the "mary_had_lamb" asset.

    The fixture is session-scoped, so the value is produced once per test
    session.
    """
    # Imported here rather than at module level; presumably so that importing
    # this test module does not require the optional audio packages --
    # NOTE(review): confirm against vllm.assets.audio.
    from vllm.assets.audio import AudioAsset
    return AudioAsset("mary_had_lamb").audio_and_sample_rate


Expand Down Expand Up @@ -109,6 +108,7 @@ def process(hf_inputs: BatchEncoding):
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
import librosa

hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit(
Expand Down
6 changes: 5 additions & 1 deletion vllm/model_executor/models/ultravox.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union, cast)

import librosa
import numpy as np
import torch
import torch.utils.checkpoint
Expand Down Expand Up @@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
feature_extractor = whisper_feature_extractor(ctx)

if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)
Expand Down
20 changes: 17 additions & 3 deletions vllm/multimodal/utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import base64
from functools import lru_cache
from io import BytesIO
from typing import List, Optional, Tuple, TypeVar, Union
from typing import Any, List, Optional, Tuple, TypeVar, Union

import librosa
import numpy as np
import soundfile
from PIL import Image

from vllm.connections import global_http_connection
Expand Down Expand Up @@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str,
return image.convert(image_mode)


def try_import_audio_packages() -> Tuple[Any, Any]:
    """Import the optional audio dependencies on demand.

    ``librosa`` and ``soundfile`` are imported inside this function instead
    of at module import time, so callers only pay for (and only require)
    them when an audio code path is actually used.

    Returns:
        A ``(librosa, soundfile)`` tuple holding the two imported modules.

    Raises:
        ImportError: If either package cannot be imported. The original
            import failure is attached as ``__cause__`` so the traceback
            shows which package is missing or failing to import.
    """
    try:
        import librosa
        import soundfile
    except ImportError as exc:
        # Chain the underlying error rather than suppressing it with
        # ``from None``: the install hint alone cannot distinguish a missing
        # package from one that is installed but fails during import.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc
    return librosa, soundfile


def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
"""
Load audio from a URL.
"""
librosa, _ = try_import_audio_packages()

if audio_url.startswith("http"):
audio_bytes = global_http_connection.get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
Expand All @@ -95,6 +105,8 @@ async def async_fetch_audio(
"""
Asynchronously fetch audio from a URL.
"""
librosa, _ = try_import_audio_packages()

if audio_url.startswith("http"):
audio_bytes = await global_http_connection.async_get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
Expand Down Expand Up @@ -123,6 +135,8 @@ def encode_audio_base64(
sampling_rate: int,
) -> str:
"""Encode audio as base64."""
_, soundfile = try_import_audio_packages()

buffered = BytesIO()
soundfile.write(buffered, audio, sampling_rate, format="WAV")

Expand Down

0 comments on commit 26c4ec0

Please sign in to comment.