From 2de197bdd4b82a004ff99806d054dce1d93b3ced Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Tue, 7 Jan 2025 03:47:36 -0800
Subject: [PATCH] [V1] Support audio language models on V1 (#11733)

Signed-off-by: Roger Wang
---
 docs/source/models/supported_models.md    |  4 ++--
 vllm/model_executor/models/qwen2_audio.py |  9 +++++---
 vllm/model_executor/models/ultravox.py    | 28 +++++++++++++++++------
 3 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 7777545b8b3c1..8c5f6836d6aa8 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen2-Audio-7B-Instruct`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2VLForConditionalGeneration`
   - Qwen2-VL
   - T + IE+ + VE+
@@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `fixie-ai/ultravox-v0_3`
   -
   - ✅︎
-  -
+  - ✅︎
 ```
 
 E Pre-computed embeddings can be inputted for this modality.
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 576b01776e5de..7012ddc66cd9c 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -335,13 +335,16 @@ def _process_audio_input(self,
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_output_lengths = audio_output_lengths.unsqueeze(1)
         audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens
-        ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
+            num_audios, max_audio_tokens).to(
+                audio_output_lengths.device) < audio_output_lengths
         masked_audio_features = audio_features[audio_features_mask].view(
             -1, embed_dim)
 
-        return masked_audio_features
+        # Split to tuple of embeddings for individual audio input.
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.flatten().tolist())
 
     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index ba823acecbb56..ecafd157b1d61 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -1,6 +1,5 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""
-
 import math
 from functools import cached_property
 from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set,
@@ -14,6 +13,7 @@
 from transformers.models.whisper import WhisperFeatureExtractor
 from transformers.models.whisper.modeling_whisper import WhisperEncoder
 
+from vllm import envs
 from vllm.attention import AttentionMetadata
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
@@ -35,8 +35,11 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)
 
+_AUDIO_PLACEHOLDER_OVERRIDE = "<|reserved_special_token_0|>"
+_AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25
 
 
@@ -64,7 +67,14 @@ def _get_hf_processor(
         # Ignored in initialization
         sampling_rate: Optional[int] = None,
     ) -> ProcessorMixin:
-        return self.ctx.get_hf_processor()
+        hf_processor = self.ctx.get_hf_processor()
+
+        # NOTE: Ultravox processing definition uses '<|eot_id|>' as the
+        # placeholder that will cause confusion with the actual end of turn
+        # token, thus we override placeholder with a reserved special
+        # token.
+        hf_processor.audio_token_replacement = _AUDIO_PLACEHOLDER_OVERRIDE
+        return hf_processor
 
     def _get_feature_extractor(
         self,
@@ -465,11 +475,15 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
 
-            # TODO(ywang96): use merge_multimodal_embeddings after
-            # v0 is deprecated
-            merge_multimodal_embeddings_from_map(
-                inputs_embeds, multimodal_embeddings,
-                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            # TODO(ywang96): remove this block after v0 is deprecated.
+            if not envs.VLLM_USE_V1:
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, multimodal_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            else:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids, inputs_embeds, multimodal_embeddings,
+                    _AUDIO_PLACEHOLDER_TOKEN)
         return inputs_embeds
 
     def forward(self,
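For reference, below is a minimal standalone sketch (PyTorch only) of the two behaviors this patch introduces: the per-audio masking and torch.split added to _process_audio_input in qwen2_audio.py, and the placeholder-token merge used on the V1 path in ultravox.py. The tensor shapes and the merge_at_placeholders helper are illustrative assumptions; the real vllm merge_multimodal_embeddings helper performs additional validation and handles nested inputs.

import torch

# --- qwen2_audio.py: mask out padded positions, then split per audio clip ---
# Shapes below are made up for illustration.
num_audios, max_audio_tokens, embed_dim = 2, 5, 4
audio_features = torch.randn(num_audios, max_audio_tokens, embed_dim)
# Number of valid (non-padded) output tokens for each audio clip.
audio_output_lengths = torch.tensor([3, 5]).unsqueeze(1)

# Boolean mask that is True for positions within each clip's valid length.
audio_features_mask = torch.arange(max_audio_tokens).expand(
    num_audios, max_audio_tokens) < audio_output_lengths
masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)

# New behavior in this patch: return one tensor per audio input instead of a
# single flattened tensor, so each item's embeddings can be tracked separately.
per_audio_embeddings = torch.split(masked_audio_features,
                                   audio_output_lengths.flatten().tolist())
assert [t.shape[0] for t in per_audio_embeddings] == [3, 5]

# --- ultravox.py (V1 path): place audio embeddings at placeholder tokens ---
# Simplified stand-in for vllm's merge_multimodal_embeddings helper.
_AUDIO_PLACEHOLDER_TOKEN = 128002  # reserved special token id from the patch


def merge_at_placeholders(input_ids, inputs_embeds, mm_embeddings,
                          placeholder_id):
    # Concatenate the per-item embeddings and scatter them into the rows of
    # inputs_embeds whose token id equals the placeholder id.
    flat = torch.cat(list(mm_embeddings), dim=0)
    mask = input_ids == placeholder_id
    assert mask.sum() == flat.shape[0], "placeholder/embedding count mismatch"
    inputs_embeds[mask] = flat
    return inputs_embeds


input_ids = torch.tensor([1, 2, _AUDIO_PLACEHOLDER_TOKEN,
                          _AUDIO_PLACEHOLDER_TOKEN, _AUDIO_PLACEHOLDER_TOKEN, 3])
inputs_embeds = torch.zeros(input_ids.shape[0], embed_dim)
inputs_embeds = merge_at_placeholders(input_ids, inputs_embeds,
                                      [per_audio_embeddings[0]],
                                      _AUDIO_PLACEHOLDER_TOKEN)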