[Misc] Optional installation of audio related packages (#8063)

This commit is contained in:
Roger Wang 2024-09-01 14:46:57 -07:00 committed by GitHub
parent 5231f0898e
commit 5b86b19954
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 29 additions and 10 deletions

View File

@ -22,9 +22,7 @@ typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
msgspec
librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
mistral_common >= 1.3.4
pyyaml
pyyaml

View File

@ -13,10 +13,12 @@ pytest-shard
awscli
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio test
peft
requests
ray
sentence-transformers # required for embedding
soundfile # required for audio test
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
@ -30,4 +32,4 @@ aiohttp
# quantization
bitsandbytes==0.42.0
buildkite-test-collector==0.1.8
buildkite-test-collector==0.1.8

View File

@ -501,6 +501,7 @@ setup(
ext_modules=ext_modules,
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"audio": ["librosa", "soundfile"] # Required for audio processing
},
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
package_data=package_data,

View File

@ -1,11 +1,9 @@
from typing import List, Optional, Tuple, Type
import librosa
import numpy as np
import pytest
from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.assets.audio import AudioAsset
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
@ -21,6 +19,7 @@ AudioTuple = Tuple[np.ndarray, int]
@pytest.fixture(scope="session")
def audio_and_sample_rate():
    """Session-scoped fixture exposing the ``mary_had_lamb`` audio asset.

    Returns the ``audio_and_sample_rate`` attribute of that asset.
    """
    # The asset module is imported inside the fixture body rather than at
    # module level, so the import only runs when the fixture is requested.
    from vllm.assets.audio import AudioAsset

    asset = AudioAsset("mary_had_lamb")
    return asset.audio_and_sample_rate
@ -109,6 +108,7 @@ def run_test(
dtype=dtype,
postprocess_inputs=process,
auto_cls=AutoModel) as hf_model:
import librosa
hf_outputs_per_audio = [
hf_model.generate_greedy_logprobs_limit(

View File

@ -8,7 +8,6 @@ from functools import lru_cache
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union, cast)
import librosa
import numpy as np
import torch
import torch.utils.checkpoint
@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
feature_extractor = whisper_feature_extractor(ctx)
if sr != feature_extractor.sampling_rate:
try:
import librosa
except ImportError:
raise ImportError(
"Please install vllm[audio] for audio support.") from None
audio = librosa.resample(audio,
orig_sr=sr,
target_sr=feature_extractor.sampling_rate)

View File

@ -1,11 +1,9 @@
import base64
from functools import lru_cache
from io import BytesIO
from typing import List, Optional, Tuple, TypeVar, Union
from typing import Any, List, Optional, Tuple, TypeVar, Union
import librosa
import numpy as np
import soundfile
from PIL import Image
from vllm.connections import global_http_connection
@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str,
return image.convert(image_mode)
def try_import_audio_packages() -> Tuple[Any, Any]:
    """Lazily import the audio packages and return them.

    ``librosa`` and ``soundfile`` are imported inside this function instead
    of at module import time, so this module can be loaded without them.

    Returns:
        A ``(librosa, soundfile)`` tuple of the imported modules.

    Raises:
        ImportError: If either package cannot be imported. The message
            points the user at the ``vllm[audio]`` extra, and the original
            import failure is chained as ``__cause__``.
    """
    try:
        import librosa
        import soundfile
    except ImportError as exc:
        # Chain the underlying error instead of suppressing it: an import
        # can fail even when the package is installed (e.g. a broken
        # transitive dependency), and the root cause is then needed to
        # debug it.
        raise ImportError(
            "Please install vllm[audio] for audio support.") from exc
    return librosa, soundfile
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
"""
Load audio from a URL.
"""
librosa, _ = try_import_audio_packages()
if audio_url.startswith("http"):
audio_bytes = global_http_connection.get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
@ -95,6 +105,8 @@ async def async_fetch_audio(
"""
Asynchronously fetch audio from a URL.
"""
librosa, _ = try_import_audio_packages()
if audio_url.startswith("http"):
audio_bytes = await global_http_connection.async_get_bytes(
audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT)
@ -123,6 +135,8 @@ def encode_audio_base64(
sampling_rate: int,
) -> str:
"""Encode audio as base64."""
_, soundfile = try_import_audio_packages()
buffered = BytesIO()
soundfile.write(buffered, audio, sampling_rate, format="WAV")