Skip to content

vllm.multimodal.audio

AudioMediaIO

Bases: MediaIO[tuple[NDArray, float]]

Source code in vllm/multimodal/audio.py
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and serialize audio as an ``(audio, sample_rate)`` pair.

    Decoding is delegated to ``librosa.load`` and encoding to
    ``soundfile.write``.
    """

    def __init__(self, **kwargs) -> None:
        """Initialize the base class and remember the extra keyword arguments."""
        super().__init__()

        # Options supplied through --media-io-kwargs for this modality
        # end up here. They are stored so that the underlying media
        # loaders (e.g. custom implementations) can use them for
        # flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode in-memory audio bytes with ``librosa.load``."""
        # librosa reads from a file-like object, so wrap the raw bytes.
        # `sr=None` is forwarded as-is (librosa documents it as "keep the
        # native sampling rate").
        stream = BytesIO(data)
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64 string and load the resulting bytes.

        ``media_type`` is accepted but not used by this implementation.
        """
        raw_bytes = base64.b64decode(data)
        return self.load_bytes(raw_bytes)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Load the audio file at ``filepath`` with ``librosa.load``."""
        loaded = librosa.load(filepath, sr=None)
        return loaded

    def encode_base64(self, media: tuple[npt.NDArray, int]) -> str:
        """Serialize ``(audio, sample_rate)`` to WAV and return it as base64 text."""
        samples, sample_rate = media

        # Write the WAV container into memory and grab the bytes before
        # the buffer is closed on leaving the `with` block.
        with BytesIO() as wav_buffer:
            soundfile.write(wav_buffer, samples, sample_rate, format="WAV")
            wav_bytes = wav_buffer.getvalue()

        encoded = base64.b64encode(wav_bytes)
        return encoded.decode("utf-8")

kwargs instance-attribute

kwargs = kwargs

__init__

__init__(**kwargs) -> None
Source code in vllm/multimodal/audio.py
def __init__(self, **kwargs) -> None:
    """Initialize the base class and store the extra keyword arguments.

    Args:
        **kwargs: Arbitrary keyword arguments, kept unchanged on
            ``self.kwargs``.
    """
    super().__init__()

    # `kwargs` contains custom arguments from
    # --media-io-kwargs for this modality.
    # They can be passed to the underlying
    # media loaders (e.g. custom implementations)
    # for flexible control.
    self.kwargs = kwargs

encode_base64

encode_base64(media: tuple[NDArray, int]) -> str
Source code in vllm/multimodal/audio.py
def encode_base64(self, media: tuple[npt.NDArray, int]) -> str:
    """Serialize ``(audio, sample_rate)`` to WAV and return it as base64 text."""
    samples, sample_rate = media

    # Write the WAV container into memory and grab the bytes before
    # the buffer is closed on leaving the `with` block.
    with BytesIO() as wav_buffer:
        soundfile.write(wav_buffer, samples, sample_rate, format="WAV")
        wav_bytes = wav_buffer.getvalue()

    encoded = base64.b64encode(wav_bytes)
    return encoded.decode("utf-8")

load_base64

load_base64(
    media_type: str, data: str
) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_base64(
    self,
    media_type: str,
    data: str,
) -> tuple[npt.NDArray, float]:
    """Decode a base64 string and hand the bytes to ``self.load_bytes``.

    ``media_type`` is accepted but not used by this implementation.
    """
    raw_bytes = base64.b64decode(data)
    return self.load_bytes(raw_bytes)

load_bytes

load_bytes(data: bytes) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
    """Decode in-memory audio bytes with ``librosa.load``."""
    # librosa reads from a file-like object, so wrap the raw bytes.
    # `sr=None` is forwarded as-is (librosa documents it as "keep the
    # native sampling rate").
    stream = BytesIO(data)
    return librosa.load(stream, sr=None)

load_file

load_file(filepath: Path) -> tuple[NDArray, float]
Source code in vllm/multimodal/audio.py
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
    """Load the audio file at ``filepath`` with ``librosa.load``."""
    # `sr=None` is forwarded as-is (librosa documents it as "keep the
    # native sampling rate").
    loaded = librosa.load(filepath, sr=None)
    return loaded

AudioResampler

Resample audio data to a target sample rate.

Source code in vllm/multimodal/audio.py
class AudioResampler:
    """Resample audio data to a target sample rate."""

    def __init__(
        self,
        target_sr: Optional[float] = None,
        method: Literal["librosa", "scipy"] = "librosa",
    ):
        """Store the resampling configuration.

        Args:
            target_sr: Sample rate to resample to. When left as ``None``,
                ``resample`` raises ``RuntimeError``.
            method: Name of the resampling backend, ``"librosa"`` or
                ``"scipy"``. It is only checked when ``resample`` is called.
        """
        self.method = method
        self.target_sr = target_sr

    def resample(
        self,
        audio: npt.NDArray[np.floating],
        *,
        orig_sr: float,
    ) -> npt.NDArray[np.floating]:
        """Resample ``audio`` from ``orig_sr`` to the configured target rate.

        Raises:
            RuntimeError: If no ``target_sr`` was configured.
            ValueError: If ``method`` is neither ``"librosa"`` nor ``"scipy"``.
        """
        target_sr = self.target_sr
        if target_sr is None:
            raise RuntimeError(
                "Audio resampling is not supported when `target_sr` is not provided"
            )

        # Choose the backend first, then make a single call below.
        backend = self.method
        if backend == "librosa":
            resample_fn = resample_audio_librosa
        elif backend == "scipy":
            resample_fn = resample_audio_scipy
        else:
            raise ValueError(
                f"Invalid resampling method: {self.method}. "
                "Supported methods are 'librosa' and 'scipy'."
            )

        return resample_fn(audio, orig_sr=orig_sr, target_sr=target_sr)

method instance-attribute

method = method

target_sr instance-attribute

target_sr = target_sr

__init__

__init__(
    target_sr: Optional[float] = None,
    method: Literal["librosa", "scipy"] = "librosa",
)
Source code in vllm/multimodal/audio.py
def __init__(
    self,
    target_sr: Optional[float] = None,
    method: Literal["librosa", "scipy"] = "librosa",
):
    """Store the resampling configuration.

    Args:
        target_sr: Sample rate to resample to; ``None`` by default.
        method: Name of the resampling backend, ``"librosa"`` or
            ``"scipy"``. It is stored without validation here.
    """
    self.method = method
    self.target_sr = target_sr

resample

resample(
    audio: NDArray[floating], *, orig_sr: float
) -> NDArray[floating]
Source code in vllm/multimodal/audio.py
def resample(
    self,
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``self.target_sr``.

    Raises:
        RuntimeError: If ``self.target_sr`` is ``None``.
        ValueError: If ``self.method`` is neither ``"librosa"`` nor
            ``"scipy"``.
    """
    target_sr = self.target_sr
    if target_sr is None:
        raise RuntimeError(
            "Audio resampling is not supported when `target_sr` is not provided"
        )

    # Choose the backend first, then make a single call below.
    backend = self.method
    if backend == "librosa":
        resample_fn = resample_audio_librosa
    elif backend == "scipy":
        resample_fn = resample_audio_scipy
    else:
        raise ValueError(
            f"Invalid resampling method: {self.method}. "
            "Supported methods are 'librosa' and 'scipy'."
        )

    return resample_fn(audio, orig_sr=orig_sr, target_sr=target_sr)

resample_audio_librosa

resample_audio_librosa(
    audio: NDArray[floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> NDArray[floating]
Source code in vllm/multimodal/audio.py
def resample_audio_librosa(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` via ``librosa.resample``.

    Both rates are forwarded to librosa unchanged.
    """
    resampled = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    return resampled

resample_audio_scipy

resample_audio_scipy(
    audio: NDArray[floating],
    *,
    orig_sr: float,
    target_sr: float,
)
Source code in vllm/multimodal/audio.py
def resample_audio_scipy(
    audio: npt.NDArray[np.floating],
    *,
    orig_sr: float,
    target_sr: float,
) -> npt.NDArray[np.floating]:
    """Resample ``audio`` from ``orig_sr`` to ``target_sr`` with SciPy.

    The conversion uses ``scipy.signal.resample_poly`` with the rate ratio
    ``target_sr / orig_sr`` reduced to lowest terms, so rates that are not
    integer multiples of each other (e.g. 44100 -> 16000) are handled
    correctly instead of being floored to the nearest integer factor.

    Args:
        audio: Samples to resample (along the first axis, which is
            ``resample_poly``'s default).
        orig_sr: Sample rate of ``audio``.
        target_sr: Desired sample rate.

    Returns:
        The resampled samples, or ``audio`` itself (not a copy) when the
        two rates are equal.

    Raises:
        ValueError: If either rate rounds to a non-positive integer.
    """
    # lazy import scipy.signal, otherwise it will crash doc build.
    import math

    import scipy.signal

    if orig_sr == target_sr:
        return audio

    # `resample_poly` needs integer up/down factors; sample rates are
    # annotated as floats, so round them to whole Hz first.
    orig_rate = int(round(orig_sr))
    target_rate = int(round(target_sr))
    if orig_rate <= 0 or target_rate <= 0:
        raise ValueError(
            "Sample rates must be positive, got "
            f"orig_sr={orig_sr} and target_sr={target_sr}."
        )

    # Reduce target/orig to lowest terms to keep the polyphase filter small.
    common = math.gcd(orig_rate, target_rate)
    up = target_rate // common
    down = orig_rate // common
    return scipy.signal.resample_poly(audio, up, down)