[V1] Extend beyond image modality and support mixed-modality inference with Llava-OneVision #11685

Merged (55 commits, Jan 6, 2025)

Commits:
022c6b4  initial (ywang96, Jan 1, 2025)
43fdf45  fix llava ov (ywang96, Jan 1, 2025)
e0fb002  iterate (ywang96, Jan 1, 2025)
a9b9757  Merge branch 'vllm-project:main' into v1-llava-ov (ywang96, Jan 2, 2025)
b45010b  revert padding tensor (ywang96, Jan 2, 2025)
d83e25e  simplify (ywang96, Jan 2, 2025)
d13b0f7  comment (ywang96, Jan 2, 2025)
7d1f19a  Merge branch 'vllm-project:main' into v1-llava-ov (ywang96, Jan 2, 2025)
6959ec0  simplify and doc (ywang96, Jan 2, 2025)
ba071c6  refactor logic (ywang96, Jan 3, 2025)
ba2f399  format (ywang96, Jan 3, 2025)
ff4cdea  Merge branch 'vllm-project:main' into v1-llava-ov (ywang96, Jan 3, 2025)
2eebfd9  switch order (ywang96, Jan 3, 2025)
20dd84d  refactor (ywang96, Jan 3, 2025)
34ec194  typing (ywang96, Jan 3, 2025)
9f19629  hasher (ywang96, Jan 3, 2025)
66484aa  consolidate mm hasher (ywang96, Jan 4, 2025)
1423f5f  typing (ywang96, Jan 4, 2025)
ba17100  Merge branch 'vllm-project:main' into v1-llava-ov (ywang96, Jan 4, 2025)
b3c41ce  Merge branch 'main' into v1-llava-ov (ywang96, Jan 5, 2025)
14481fd  fix length check (ywang96, Jan 5, 2025)
6f435cf  update profiling (ywang96, Jan 5, 2025)
16e5b04  update dummy data for llava-ov (ywang96, Jan 5, 2025)
612880b  preserve modality order (ywang96, Jan 5, 2025)
3022754  format (ywang96, Jan 5, 2025)
20d6a67  simplify (ywang96, Jan 5, 2025)
3dd2db2  typo (ywang96, Jan 5, 2025)
5ce6f7a  clarify (ywang96, Jan 5, 2025)
4113e51  add test (ywang96, Jan 5, 2025)
3ca30fc  fix test (ywang96, Jan 5, 2025)
ef8c6d1  add note (ywang96, Jan 5, 2025)
87f4216  Merge branch 'v1-llava-ov' of https://github.com/ywang96/vllm into v1… (ywang96, Jan 5, 2025)
bc1debd  comment (ywang96, Jan 5, 2025)
56a7ef0  typo (ywang96, Jan 5, 2025)
568a586  rename (ywang96, Jan 5, 2025)
6ca99a3  remove redundant constants (ywang96, Jan 5, 2025)
6c8ff3b  update interface with note (ywang96, Jan 5, 2025)
293b3fe  update doc (ywang96, Jan 5, 2025)
14482bf  address review comments (ywang96, Jan 6, 2025)
eeee402  use namedtuple (ywang96, Jan 6, 2025)
7f4815e  add comment (ywang96, Jan 6, 2025)
1ba40e9  update (ywang96, Jan 6, 2025)
2eb4cf1  format (ywang96, Jan 6, 2025)
fe71431  format (ywang96, Jan 6, 2025)
1a7b39c  remove unneeded check (ywang96, Jan 6, 2025)
61991b6  Merge branch 'main' into v1-llava-ov (ywang96, Jan 6, 2025)
ceec26e  remove unused import (ywang96, Jan 6, 2025)
7879952  restrict mm_hash to V1 (ywang96, Jan 6, 2025)
72ae769  fix test and reorder code for readability (ywang96, Jan 6, 2025)
48811b6  typo (ywang96, Jan 6, 2025)
b31fd4f  format (ywang96, Jan 6, 2025)
be54b2c  Fix dummy requests (DarkLight1337, Jan 6, 2025)
b2cbc5a  Pass sanity check (DarkLight1337, Jan 6, 2025)
3400d07  format (DarkLight1337, Jan 6, 2025)
2461f0f  Merge branch 'main' into v1-llava-ov (DarkLight1337, Jan 6, 2025)

Files changed:

2 changes: 1 addition & 1 deletion docs/source/models/supported_models.md
@@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
-
- ✅︎
* - `MiniCPMV`
- MiniCPM-V
- T + I<sup>E+</sup>
209 changes: 208 additions & 1 deletion tests/multimodal/test_utils.py
@@ -2,16 +2,22 @@
import mimetypes
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import Dict, Tuple
from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple

import numpy as np
import pytest
from PIL import Image, ImageChops
from transformers import AutoConfig, AutoTokenizer

from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import (MediaConnector,
merge_and_sort_multimodal_metadata,
repeat_and_pad_placeholder_tokens)

if TYPE_CHECKING:
from vllm.multimodal.hasher import MultiModalHashDict
from vllm.multimodal.inputs import MultiModalPlaceholderDict

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
@@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model):
assert new_prompt == expected_prompt
assert new_token_ids == expected_token_ids
assert ranges == expected_ranges


# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"
mm_hashes: Optional["MultiModalHashDict"]
expected_modalities: list[str]
expected_ranges: list[PlaceholderRange]
expected_hashes: Optional[list[str]]


def test_merge_and_sort_multimodal_metadata():

test_cases = [
# A single modality should return the result as-is, but flattened.
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2),
]
},
mm_hashes={"image": ["hash1", "hash2"]},
expected_modalities=["image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2),
],
expected_hashes=["hash1", "hash2"],
),

# A single modality without hashes should return None for the mm hash.
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
]
},
mm_hashes=None,
expected_modalities=["image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
],
expected_hashes=None,
),

# Multiple modalities with hashes should return sorted modalities
# and flattened ranges and hashes.
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
],
"audio": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=["audio", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
],
expected_hashes=[
"audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
],
),

# Multiple modalities without hashes should return sorted modalities
# and flattened ranges and None.
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
],
"audio": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
]
},
mm_hashes=None,
expected_modalities=["audio", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
],
expected_hashes=None,
),

# Three modalities
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=15, length=7),
PlaceholderRange(offset=22, length=8),
],
"audio": [
PlaceholderRange(offset=0, length=2),
],
"video": [
PlaceholderRange(offset=3, length=4),
PlaceholderRange(offset=7, length=5),
PlaceholderRange(offset=12, length=6),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1", "video_hash2", "video_hash3"]
},
expected_modalities=["audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=4),
PlaceholderRange(offset=7, length=5),
PlaceholderRange(offset=12, length=6),
PlaceholderRange(offset=15, length=7),
PlaceholderRange(offset=22, length=8),
],
expected_hashes=[
"audio_hash1", "video_hash1", "video_hash2", "video_hash3",
"image_hash1", "image_hash2"
],
),
]

for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
expected_hashes) in test_cases:
modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
mm_positions, mm_hashes)

assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes


def test_merge_and_sort_multimodal_metadata_with_interleaving():

test_cases = [

# <image> <audio> <image> <audio>
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=8, length=2),
],
"audio": [
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=11, length=4),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=[],
expected_ranges=[],
expected_hashes=None,
),

# <image> <image> <video> <audio> <image>
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=20, length=4),
],
"audio": [
PlaceholderRange(offset=5, length=2),
],
"video": [
PlaceholderRange(offset=8, length=5),
]
},
mm_hashes=None,
expected_modalities=[],
expected_ranges=[],
expected_hashes=None,
),
]

for case in test_cases:
with pytest.raises(ValueError) as ex_info:
merge_and_sort_multimodal_metadata(case.mm_positions,
case.mm_hashes)

assert "Interleaved mixed-modality" in str(ex_info.value)
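These tests exercise `merge_and_sort_multimodal_metadata`, which orders modalities by where their first placeholder appears in the prompt, flattens the per-modality ranges (and hashes, when given), and rejects interleaved layouts. A minimal standalone sketch of that behavior, using plain dicts in place of `PlaceholderRange` (the real implementation lives in `vllm/multimodal/utils.py` and may differ in detail):

```python
from typing import Optional


def merge_and_sort_mm_metadata_sketch(
    mm_positions: dict[str, list[dict]],
    mm_hashes: Optional[dict[str, list[str]]] = None,
):
    # Order modalities by the offset of their first placeholder in the prompt.
    modalities = sorted(mm_positions,
                        key=lambda m: mm_positions[m][0]["offset"])

    # Flatten ranges in that order, rejecting interleaved layouts: every
    # placeholder of one modality must come before the next modality starts.
    merged_ranges: list[dict] = []
    for modality in modalities:
        ranges = mm_positions[modality]
        if merged_ranges and ranges[0]["offset"] < merged_ranges[-1]["offset"]:
            raise ValueError(
                "Interleaved mixed-modality inputs are not supported")
        merged_ranges.extend(ranges)

    merged_hashes = (None if mm_hashes is None else
                     [h for m in modalities for h in mm_hashes[m]])
    return modalities, merged_ranges, merged_hashes


# Mirrors the "audio before image" cases above.
mods, ranges, hashes = merge_and_sort_mm_metadata_sketch(
    {"image": [{"offset": 7, "length": 4}], "audio": [{"offset": 0, "length": 2}]},
    {"image": ["image_hash1"], "audio": ["audio_hash1"]})
assert mods == ["audio", "image"] and hashes == ["audio_hash1", "image_hash1"]
```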
6 changes: 5 additions & 1 deletion vllm/model_executor/models/interfaces.py
@@ -39,8 +39,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:

The output embeddings must be one of the following formats:
- A list or tuple of 2D tensors, where each tensor corresponds to
each input image.
each input multimodal data item (e.g., image).
- A single 3D tensor, with the batch dimension grouping the 2D tensors.

NOTE: The returned multimodal embeddings must be in the same order as
their corresponding multimodal data items appear in the input prompt.
"""
...
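As a concrete illustration of that ordering contract, a hedged sketch with made-up shapes (not part of the actual interface): for a prompt laid out as `<video> ... <image>`, the video item's embedding must come first in the returned sequence.

```python
import torch

# Hypothetical prompt layout: one video placeholder, then one image placeholder.
# Each returned element is a 2D tensor of shape (num_placeholder_tokens, hidden_size).
hidden_size = 8                             # assumed
video_embeds = torch.zeros(5, hidden_size)  # stands in for the video item
image_embeds = torch.zeros(3, hidden_size)  # stands in for the image item

# The contract: return items in prompt order, so video first here.
multimodal_embeddings = (video_embeds, image_embeds)
assert [e.shape[0] for e in multimodal_embeddings] == [5, 3]
```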

65 changes: 33 additions & 32 deletions vllm/model_executor/models/llava_onevision.py
@@ -35,6 +35,9 @@
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
maybe_prefix, merge_multimodal_embeddings)

# For profile run
_MAX_FRAMES_PER_VIDEO = 16


class LlavaOnevisionVideoPixelInputs(TypedDict):
type: Literal["pixel_values_videos"]
@@ -207,8 +210,10 @@ def _get_dummy_num_frames(self, seq_len: int) -> int:
max_image_tokens = self._get_max_image_tokens() * max_images
max_total_frames = self._get_max_video_frames(seq_len -
max_image_tokens)
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
_MAX_FRAMES_PER_VIDEO)

return max(max_total_frames // max(max_videos, 1), 1)
return max(max_frames_per_video, 1)

def _get_max_video_tokens(self, seq_len: int) -> int:
target_width, target_height = self._get_image_size_with_most_features()
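The new `_MAX_FRAMES_PER_VIDEO` cap only affects the profiling/dummy-data path. A rough worked example with assumed numbers (the real counts come from the model config and `_get_max_video_frames`):

```python
# Hedged example with made-up numbers: suppose the profiling sequence length
# leaves room for 100 video frames in total and the dummy request contains
# 2 videos. Before this change each video got 100 // 2 = 50 frames; with the
# cap it gets min(50, _MAX_FRAMES_PER_VIDEO) = 16 frames.
_MAX_FRAMES_PER_VIDEO = 16
max_total_frames = 100   # assumed result of _get_max_video_frames(...)
max_videos = 2           # assumed number of videos in the dummy request
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                           _MAX_FRAMES_PER_VIDEO)
assert max(max_frames_per_video, 1) == 16
```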
@@ -542,13 +547,15 @@ def _parse_and_validate_video_input(
def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
modalities = {}

if "pixel_values" in kwargs:
modalities["images"] = self._parse_and_validate_image_input(
**kwargs)

if "pixel_values_videos" in kwargs:
modalities["videos"] = self._parse_and_validate_video_input(
**kwargs)
# Preserve the order of modalities (if there are multiple of them)
# by following the order of the kwargs.
for input_key in kwargs:
if input_key == "pixel_values" and "images" not in modalities:
modalities["images"] = self._parse_and_validate_image_input(
**kwargs)
if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501
modalities["videos"] = self._parse_and_validate_video_input(
**kwargs)

return modalities
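This refactor leans on Python keyword arguments (like all dicts since 3.7) preserving insertion order, so iterating over `kwargs` records modalities in the order their inputs were passed in. A tiny standalone illustration (the argument values are placeholders):

```python
# Python dicts (and **kwargs) preserve insertion order, so iterating over
# kwargs lets the model record modalities in the order their inputs arrive.
def collect_modalities(**kwargs):
    modalities = {}
    for input_key in kwargs:
        if input_key == "pixel_values_videos" and "videos" not in modalities:
            modalities["videos"] = kwargs[input_key]
        if input_key == "pixel_values" and "images" not in modalities:
            modalities["images"] = kwargs[input_key]
    return modalities


# Video kwargs passed first -> "videos" comes first when iterating.
order = list(collect_modalities(pixel_values_videos=[...], pixel_values=[...]))
assert order == ["videos", "images"]
```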

@@ -808,21 +815,21 @@ def get_multimodal_embeddings(
if not modalities:
return None

# We make a tuple of each embedding with its modality string. This is a
# temporary workaround for models to handle mixed modalities when
# get_multimodal_embeddings and get_input_embeddings are called
# separately.
# TODO(ywang96): Add support for mixed-modality inference for v1.
multimodal_embeddings: List[Tuple[NestedTensors, str]] = []

if "images" in modalities:
image_input = modalities["images"]
vision_embeddings = self._process_image_input(image_input)
multimodal_embeddings.append((vision_embeddings, "image"))
if "videos" in modalities:
video_input = modalities["videos"]
video_embeddings = self._process_video_pixels(video_input)
multimodal_embeddings.append((video_embeddings, "video"))
# The resulting multimodal_embeddings is a tuple of tensors, with each
# tensor corresponding to a multimodal data item (image or video).
multimodal_embeddings: tuple[torch.Tensor, ...] = ()

# NOTE: It is important to iterate over the keys in this dictionary
# to preserve the order of the modalities.
for modality in modalities:
if modality == "images":
image_input = modalities["images"]
vision_embeddings = self._process_image_input(image_input)
multimodal_embeddings += tuple(vision_embeddings)
if modality == "videos":
video_input = modalities["videos"]
video_embeddings = self._process_video_pixels(video_input)
multimodal_embeddings += tuple(video_embeddings)

return multimodal_embeddings

@@ -834,15 +841,9 @@ def get_input_embeddings(
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
for embeddings, modality in multimodal_embeddings:
if modality == "image":
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, embeddings,
self.config.image_token_index)
if modality == "video":
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, embeddings,
self.config.video_token_index)
inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings,
[self.config.image_token_index, self.config.video_token_index])
return inputs_embeds

def forward(
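The single `merge_multimodal_embeddings` call with both placeholder token ids works because the flattened embeddings arrive in prompt order. A simplified standalone sketch of the scatter this performs (not the real vLLM implementation, which handles more cases):

```python
import torch


def merge_mm_embeddings_sketch(input_ids: torch.Tensor,
                               inputs_embeds: torch.Tensor,
                               mm_embeds: tuple[torch.Tensor, ...],
                               placeholder_token_ids: list[int]) -> torch.Tensor:
    """Overwrite every position whose token id is one of the placeholder ids
    with the flattened multimodal embeddings, in prompt order."""
    is_mm = torch.isin(input_ids, torch.tensor(placeholder_token_ids))
    flat = torch.cat(mm_embeds, dim=0)  # prompt-ordered items
    assert int(is_mm.sum()) == flat.shape[0], "placeholder/embedding mismatch"
    inputs_embeds = inputs_embeds.clone()
    inputs_embeds[is_mm] = flat
    return inputs_embeds


# Toy example: token ids 101 (image) and 102 (video) mark placeholder positions.
ids = torch.tensor([1, 101, 101, 2, 102, 3])
embeds = torch.zeros(6, 4)
merged = merge_mm_embeddings_sketch(
    ids, embeds, (torch.ones(2, 4), 2 * torch.ones(1, 4)), [101, 102])
assert merged[1, 0] == 1 and merged[4, 0] == 2
```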
3 changes: 0 additions & 3 deletions vllm/model_executor/models/molmo.py
@@ -972,8 +972,6 @@ def image_input_mapper_for_molmo(
assert len(data) == 1, "Molmo supports only one image per prompt."
data = data[0]

# Remove unused dummy PIL image
data.pop('raw_mm_data', None)
return MultiModalKwargs(data)


@@ -1019,7 +1017,6 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
dummy_imgdata = {
"images": out["images"],
"image_input_idx": out["image_input_idx"],
"raw_mm_data": dummy_image,
}
if "image_masks" in out:
dummy_imgdata["image_masks"] = out["image_masks"]
3 changes: 3 additions & 0 deletions vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
from .base import MultiModalPlaceholderMap, MultiModalPlugin
from .hasher import MultiModalHashDict, MultiModalHasher
from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict, NestedTensors)
@@ -18,6 +19,8 @@
"ModalityData",
"MultiModalDataBuiltins",
"MultiModalDataDict",
"MultiModalHashDict",
"MultiModalHasher",
"MultiModalKwargs",
"MultiModalPlaceholderDict",
"MultiModalPlaceholderMap",