[VLM] Reorganize profiling/processing-related code #11812

Merged
Changes from 3 commits

LLaVA-NeXT processing test
@@ -4,24 +4,17 @@
import pytest
from PIL import Image
from pqdm.threads import pqdm
from transformers import AutoTokenizer

from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processor import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer

from ....utils import build_model_context


# Fixtures lazy import to avoid initializing CUDA during test collection
@pytest.fixture()
def processor_for_llava_next():
from vllm.model_executor.models.llava_next import (
LlavaNextMultiModalProcessor)
return LlavaNextMultiModalProcessor


def _validate_image_prompt_replacements_one(
processor,
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
@@ -78,20 +71,17 @@ def _test_image_prompt_replacements(

@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(
processor_for_llava_next,
model_id: str,
num_imgs: int,
):
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_next(ctx)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)

image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(
processor_for_llava_next,
model_id: str,
num_imgs: int,
):
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_next(ctx)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)

seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()

LLaVA-OneVision processing test
@@ -4,24 +4,17 @@
import pytest
from PIL import Image
from pqdm.threads import pqdm
from transformers import AutoTokenizer

from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processor import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer

from ....utils import build_model_context


# Fixtures lazy import to avoid initializing CUDA during test collection
@pytest.fixture()
def processor_for_llava_onevision():
from vllm.model_executor.models.llava_onevision import (
LlavaOnevisionMultiModalProcessor)
return LlavaOnevisionMultiModalProcessor


def _validate_image_prompt_replacements_one(
processor,
processor: BaseMultiModalProcessor,
num_imgs: int,
failed_size_excs: list[tuple[ImageSize, Exception]],
image_size: ImageSize,
@@ -77,20 +70,17 @@ def _test_image_prompt_replacements(
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(
processor_for_llava_onevision,
model_id: str,
num_imgs: int,
):
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_onevision(ctx)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)

image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
@@ -111,20 +101,17 @@ def test_processor_prompt_replacements_regression(
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(
processor_for_llava_onevision,
model_id: str,
num_imgs: int,
):
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
ctx = InputProcessingContext(ctx.model_config, tokenizer)
processor = processor_for_llava_onevision(ctx)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)

seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()
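
Both processing-test files above make the same change: the lazily imported per-model processor fixtures are dropped, and the processor is built through the multi-modal registry instead. Below is a minimal sketch consolidating the new construction shown in the diff; the helper name make_processor is illustrative, everything else comes from the changed test code.

from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer


def make_processor(model_config: ModelConfig):
    """Build the registered multi-modal processor for a model config.

    Replaces the old per-model fixtures (e.g. processor_for_llava_next)
    that imported LlavaNextMultiModalProcessor and friends directly.
    """
    return MULTIMODAL_REGISTRY.create_processor(
        model_config,
        tokenizer=cached_get_tokenizer(model_config.tokenizer),
    )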
52 changes: 25 additions & 27 deletions tests/multimodal/test_processing.py
@@ -10,12 +10,17 @@
from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (ProcessingCache, PromptReplacement,
_PlaceholderInfo, find_mm_placeholders,
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (PlaceholderInfo, ProcessingCache,
PromptReplacement,
find_mm_placeholders,
find_text_matches, find_token_matches,
iter_token_matches,
replace_text_matches,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiler import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import full_groupby
@@ -431,7 +436,7 @@ def test_find_replace_tokens(
[1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
{
"pattern_1": [
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_1",
item_idx=0,
start_idx=6,
@@ -445,21 +450,21 @@
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
),
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_1",
item_idx=1,
start_idx=5,
replacement=[32000, 32000],
),
],
"pattern_3": [
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_3",
item_idx=0,
start_idx=7,
@@ -472,21 +477,21 @@
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
),
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_1",
item_idx=1,
start_idx=3,
replacement=[32000, 32000],
),
],
"pattern_3": [
_PlaceholderInfo(
PlaceholderInfo(
modality="pattern_3",
item_idx=0,
start_idx=6,
@@ -577,27 +582,23 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
)

processor = processor_factory(ctx, cache=None)
profiler = processor.profiling_info
profiler = MultiModalProfiler(processor)

mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
profiler.get_supported_mm_limits = mock_supported_mm_limits
processor.info.get_supported_mm_limits = mock_supported_mm_limits

if is_valid:
exc_ctx = nullcontext()
else:
exc_ctx = pytest.raises(ValueError, match="this model only supports")

with exc_ctx:
profiler.get_mm_limits()
profiler.get_dummy_data(model_config.max_model_len)


@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@@ -620,16 +621,12 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
processor = MULTIMODAL_REGISTRY.create_processor(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
)

processor = processor_factory(ctx, cache=None)

rng = np.random.RandomState(0)
image = _rand_img(rng, min_wh=128, max_wh=256)
if num_images == 0:
@@ -681,18 +678,19 @@ def _test_processing_cache_correctness(
hf_overrides=hf_overrides,
limit_mm_per_prompt=limit_mm_per_prompt,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls]
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
model_config,
tokenizer=cached_get_tokenizer(model_config.tokenizer),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30)

baseline_processor = processor_factory(ctx, cache=None)
cached_processor = processor_factory(ctx, cache=cache)
baseline_processor = factories.build_processor(ctx, cache=None)
cached_processor = factories.build_processor(ctx, cache=cache)
dummy_inputs = baseline_processor.dummy_inputs

rng = np.random.RandomState(0)

@@ -724,7 +722,7 @@ def _test_processing_cache_correctness(
}

mm_counts = {k: len(vs) for k, vs in mm_data.items()}
prompt = baseline_processor.profiling_info.get_dummy_processor_inputs(
prompt = dummy_inputs.get_dummy_processor_inputs(
model_config.max_model_len,
mm_counts,
).prompt_text
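
tests/multimodal/test_processing.py then exercises the split between processing and profiling introduced here: dummy-data generation moves to a standalone MultiModalProfiler, supported modality limits are reported by the processor's info object, and dummy processor inputs hang off processor.dummy_inputs. A rough sketch of the new call sites, assuming processor and model_config are set up as in the sketch above:

from unittest.mock import MagicMock

from vllm.multimodal.profiler import MultiModalProfiler

# Profiling is now a separate object wrapping the processor, rather than a
# `profiling_info` attribute on the processor itself.
profiler = MultiModalProfiler(processor)

# Supported per-modality limits come from the processor's `info` object, so
# the limit test patches that instead of the old profiler attribute.
processor.info.get_supported_mm_limits = MagicMock(return_value={"image": 1})

# Dummy data for memory profiling is requested from the profiler.
dummy_data = profiler.get_dummy_data(model_config.max_model_len)

# Dummy processor inputs (prompt text plus multi-modal data) now live on
# `processor.dummy_inputs` instead of `processor.profiling_info`.
prompt = processor.dummy_inputs.get_dummy_processor_inputs(
    model_config.max_model_len,
    {"image": 1},
).prompt_text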

Out-of-tree model plugin (MyLlava)
@@ -2,13 +2,17 @@

import torch

from vllm.model_executor.models.llava import (LlavaForConditionalGeneration,
LlavaMultiModalProcessor)
from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
LlavaForConditionalGeneration,
LlavaMultiModalProcessor,
LlavaProcessingInfo)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY


@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor)
@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor,
info=LlavaProcessingInfo,
dummy=LlavaDummyInputsBuilder)
class MyLlava(LlavaForConditionalGeneration):

def compute_logits(
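
For out-of-tree models, register_processor now takes the processor class together with its processing-info class and dummy-inputs builder, as the plugin change above shows. A self-contained sketch of the new registration at this commit snapshot, reusing the in-tree LLaVA classes (MyLlava is the plugin's own example subclass):

from vllm.model_executor.models.llava import (LlavaDummyInputsBuilder,
                                              LlavaForConditionalGeneration,
                                              LlavaMultiModalProcessor,
                                              LlavaProcessingInfo)
from vllm.multimodal import MULTIMODAL_REGISTRY


# The decorator wires up three collaborators: the processor itself, the
# `info` class that exposes model-specific processing information such as
# supported multi-modal limits, and the `dummy` builder used for profiling.
@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor,
                                        info=LlavaProcessingInfo,
                                        dummy=LlavaDummyInputsBuilder)
class MyLlava(LlavaForConditionalGeneration):
    # Model-specific overrides (the plugin also overrides compute_logits)
    # would go here; registration alone is enough to hook into the registry.
    pass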
2 changes: 1 addition & 1 deletion vllm/inputs/preprocess.py
@@ -7,7 +7,7 @@
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.utils import print_info_once, print_warning_once
4 changes: 3 additions & 1 deletion vllm/inputs/registry.py
@@ -323,6 +323,7 @@ def dummy_data_for_profiling(
# Avoid circular import
from vllm.model_executor.model_loader import get_model_architecture
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.profiler import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer

if mm_registry.has_processor(model_config):
@@ -331,7 +332,8 @@
trust_remote_code=model_config.trust_remote_code,
)
processor = mm_registry.create_processor(model_config, tokenizer)
dummy_data = processor.get_dummy_data(seq_len)
profiler = MultiModalProfiler(processor)
dummy_data = profiler.get_dummy_data(seq_len)
else:
model_cls, _ = get_model_architecture(model_config)
if is_encoder_data: