[V1] Support audio language models on V1 #11733

Merged · 7 commits · Jan 7, 2025

Changes shown are from 2 commits.
4 changes: 2 additions & 2 deletions docs/source/models/supported_models.md
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `Qwen/Qwen2-Audio-7B-Instruct`
   -
   - ✅︎
-  -
+  - ✅︎
 * - `Qwen2VLForConditionalGeneration`
   - Qwen2-VL
   - T + I<sup>E+</sup> + V<sup>E+</sup>
@@ -724,7 +724,7 @@ See [this page](#generative-models) for more information on how to use generativ
   - `fixie-ai/ultravox-v0_3`
   -
   - ✅︎
-  -
+  - ✅︎
 ```

 <sup>E</sup> Pre-computed embeddings can be inputted for this modality.
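The two table edits above flip the last (V1) column to ✅︎ for Qwen2-Audio and Ultravox, i.e. both audio models can now run on the V1 engine. A rough usage sketch follows; `VLLM_USE_V1` is the switch referenced in the ultravox.py change below, but the prompt placeholder string and the `multi_modal_data` audio format are assumptions that should be checked against vLLM's official audio examples.

```python
# Sketch only: run an audio LM on the V1 engine (the prompt format is
# model-specific and assumed here; see vLLM's audio examples for the exact template).
import os
os.environ["VLLM_USE_V1"] = "1"  # opt in to the V1 engine before importing vLLM

import numpy as np
from vllm import LLM

llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct")

# Dummy 1-second, 16 kHz waveform standing in for a real recording.
sampling_rate = 16000
audio = np.zeros(sampling_rate, dtype=np.float32)

outputs = llm.generate({
    # Assumed Qwen2-Audio placeholder; the audio tokens are expanded internally.
    "prompt": "<|audio_bos|><|AUDIO|><|audio_eos|>What do you hear?",
    "multi_modal_data": {"audio": (audio, sampling_rate)},
})
print(outputs[0].outputs[0].text)
```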
9 changes: 6 additions & 3 deletions vllm/model_executor/models/qwen2_audio.py
@@ -310,13 +310,16 @@ def _process_audio_input(self,
         selected_audio_feature = audio_outputs.last_hidden_state
         audio_features = self.multi_modal_projector(selected_audio_feature)
         num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_output_lengths = audio_output_lengths.unsqueeze(1)
         audio_features_mask = torch.arange(max_audio_tokens).expand(
-            num_audios, max_audio_tokens
-        ).to(audio_output_lengths.device) < audio_output_lengths.unsqueeze(1)
+            num_audios, max_audio_tokens).to(
+                audio_output_lengths.device) < audio_output_lengths
         masked_audio_features = audio_features[audio_features_mask].view(
             -1, embed_dim)

-        return masked_audio_features
+        # Split to tuple of embeddings for individual audio input.
+        return torch.split(masked_audio_features,
+                           audio_output_lengths.flatten().tolist())

     def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
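The behavioral change in this hunk is the return value: instead of one flat tensor containing all unpadded audio embeddings, `_process_audio_input` now returns a tuple with one tensor per audio item, which is the shape the V1 multimodal path expects. A standalone sketch with made-up shapes (not vLLM code) showing how the padding mask and `torch.split` fit together:

```python
import torch

# Hypothetical sizes: two audio clips padded to 4 output tokens each, embed_dim 8.
audio_features = torch.randn(2, 4, 8)            # (num_audios, max_audio_tokens, embed_dim)
audio_output_lengths = torch.tensor([[3], [2]])  # real token count per clip, shape (num_audios, 1)

num_audios, max_audio_tokens, embed_dim = audio_features.shape
# True wherever the position index is below the clip's real length.
audio_features_mask = torch.arange(max_audio_tokens).expand(
    num_audios, max_audio_tokens).to(
        audio_output_lengths.device) < audio_output_lengths
masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)

# V0 returned the flat (3 + 2, 8) tensor; V1 wants one tensor per audio item,
# so split it back along the token dimension using the per-clip lengths.
per_audio = torch.split(masked_audio_features,
                        audio_output_lengths.flatten().tolist())
print([t.shape for t in per_audio])  # [torch.Size([3, 8]), torch.Size([2, 8])]
```

Unsqueezing `audio_output_lengths` once up front keeps the broadcasted comparison and the later `flatten().tolist()` working off the same `(num_audios, 1)` tensor.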
18 changes: 12 additions & 6 deletions vllm/model_executor/models/ultravox.py
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py
 """PyTorch Ultravox model."""

-import math
+import os
 from functools import cached_property
 from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
                     TypedDict, Union)
@@ -36,8 +36,10 @@
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
                     init_vllm_registered_model, maybe_prefix,
+                    merge_multimodal_embeddings,
                     merge_multimodal_embeddings_from_map)

+_AUDIO_PLACEHOLDER_TOKEN = 128002
 _AUDIO_TOKENS_PER_SECOND = 6.25


@@ -449,11 +451,15 @@ def get_input_embeddings(
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:

-            # TODO(ywang96): use merge_multimodal_embeddings after
-            # v0 is deprecated
-            merge_multimodal_embeddings_from_map(
-                inputs_embeds, multimodal_embeddings,
-                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            # TODO(ywang96): remove this block after v0 is deprecated.
+            if os.environ.get("VLLM_USE_V1") == "0":
+                merge_multimodal_embeddings_from_map(
+                    inputs_embeds, multimodal_embeddings,
+                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
+            else:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids, inputs_embeds, multimodal_embeddings,
+                    _AUDIO_PLACEHOLDER_TOKEN)
         return inputs_embeds

     def forward(self,
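On the V1 path, Ultravox now merges audio embeddings into `inputs_embeds` by matching the placeholder token ID (`_AUDIO_PLACEHOLDER_TOKEN = 128002`) in `input_ids`, instead of using the V0 attention-metadata index map. The sketch below is a simplified, hypothetical stand-in for what such a token-ID-based merge does; vLLM's real `merge_multimodal_embeddings` additionally handles nested/tuple embeddings and validates placeholder counts.

```python
import torch

def merge_by_placeholder(input_ids: torch.Tensor,
                         inputs_embeds: torch.Tensor,
                         multimodal_embeds: torch.Tensor,
                         placeholder_token_id: int) -> torch.Tensor:
    """Scatter audio embeddings into the positions occupied by the placeholder
    token, leaving all other (text) embeddings untouched."""
    mask = input_ids == placeholder_token_id
    assert int(mask.sum()) == multimodal_embeds.shape[0], \
        "number of placeholder tokens must match number of audio embeddings"
    merged = inputs_embeds.clone()
    merged[mask] = multimodal_embeds
    return merged

# Toy example: a 6-token prompt whose 2nd and 3rd tokens are audio placeholders.
AUDIO_PLACEHOLDER = 128002  # value taken from the diff above
input_ids = torch.tensor([1, AUDIO_PLACEHOLDER, AUDIO_PLACEHOLDER, 5, 6, 7])
inputs_embeds = torch.zeros(6, 4)
audio_embeds = torch.ones(2, 4)
merged = merge_by_placeholder(input_ids, inputs_embeds, audio_embeds, AUDIO_PLACEHOLDER)
print(merged[1], merged[3])  # placeholder rows are ones, text rows stay zeros
```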