diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 85d844f3d3f55..d07cde3db5c6e 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -618,7 +618,7 @@ See [this page](#generative-models) for more information on how to use generativ
* - `DeepseekVLV2ForCausalLM`
- DeepSeek-VL2
- T + I<sup>+</sup>
- - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
+ - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
-
- ✅︎
- ✅︎
@@ -768,9 +768,8 @@ See [this page](#generative-models) for more information on how to use generativ
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.

````{note}
-The `deepseek-ai/deepseek-vl2-tiny` is not supported yet.
-
To use the `DeepSeek-VL2` series of models, you need to install a forked version of the `deepseek_vl2` package:
+
```shell
pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git
```
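
For context on the docs change above, here is a minimal offline-inference sketch for `deepseek-ai/deepseek-vl2-tiny` (not part of this diff). The `<|User|>`/`<|Assistant|>` template and the `<image>` placeholder are assumed from this PR's test `prompt_formatter`; the image path is a placeholder.

```python
# Hypothetical single-image smoke test for DeepSeek-VL2-tiny on vLLM.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="deepseek-ai/deepseek-vl2-tiny",
    max_model_len=4096,
    max_num_seqs=2,
    # Route to the vLLM implementation added for this model family.
    hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
)

# Any local RGB image works here; the path is a placeholder.
image = Image.open("example.jpg").convert("RGB")
# Assumed chat template and image placeholder for DeepSeek-VL2.
prompt = ("<|User|>: <image>\nWhat's the content in the center of the image?"
          "\n\n<|Assistant|>:")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```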
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index ad32b9fe242e9..8bc715a50e0db 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str):
def run_deepseek_vl2(question: str, modality: str):
assert modality == "image"
- model_name = "deepseek-ai/deepseek-vl2-small"
+ model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name,
max_model_len=4096,
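
The hunk above only swaps the checkpoint name; for readers following along, this is roughly how the updated `run_deepseek_vl2` example might read in full. The prompt template, the `hf_overrides` argument, and the `(llm, prompt, stop_token_ids)` return convention are assumptions inferred from the rest of this PR, not verbatim file content.

```python
from vllm import LLM  # already imported at the top of the example script


def run_deepseek_vl2(question: str, modality: str):
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    llm = LLM(model=model_name,
              max_model_len=4096,
              max_num_seqs=2,
              hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})

    # Assumed DeepSeek-VL2 chat template with a single image placeholder.
    prompt = f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
    stop_token_ids = None
    return llm, prompt, stop_token_ids
```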
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index c6cf3f30c31cb..33ef5f316f040 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData:
def load_deepseek_vl2(question: str, image_urls: List[str]):
- model_name = "deepseek-ai/deepseek-vl2-small"
+ model_name = "deepseek-ai/deepseek-vl2-tiny"
llm = LLM(model=model_name,
max_model_len=4096,
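
Similarly, a hedged multi-image sketch for the loader above (not the script's exact code). The `image_N:<image>` placeholder style is taken from the updated multi-image test prompt; `limit_mm_per_prompt`, the question, and the local image paths are illustrative assumptions.

```python
from PIL import Image

from vllm import LLM, SamplingParams

image_paths = ["first.jpg", "second.jpg"]  # placeholder local images
images = [Image.open(p).convert("RGB") for p in image_paths]

llm = LLM(model="deepseek-ai/deepseek-vl2-tiny",
          max_model_len=4096,
          max_num_seqs=2,
          hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
          # Allow more than one image per prompt.
          limit_mm_per_prompt={"image": len(images)})

# Build "image_1:<image>\nimage_2:<image>\n..." placeholders, one per image.
placeholder = "".join(f"image_{i}:<image>\n"
                      for i in range(1, len(images) + 1))
prompt = f"<|User|>: {placeholder}Which image shows the car?\n\n<|Assistant|>:"

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": images}},
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```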
diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py
index 7620ed1107e8f..5710303548c34 100644
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -9,6 +9,7 @@
import pytest
from transformers import AutoModelForVision2Seq
+from transformers import __version__ as TRANSFORMERS_VERSION
from transformers.utils import is_flash_attn_2_available
from vllm.platforms import current_platform
@@ -189,30 +190,27 @@
dtype="bfloat16",
),
"deepseek_vl_v2": VLMTestInfo(
- models=["deepseek-ai/deepseek-vl2-small"],
+ models=["deepseek-ai/deepseek-vl2-tiny"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
- dtype="bfloat16",
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts({
- "stop_sign": "\nWhat's the color of the stop sign and car?",
- "cherry_blossom": "\nWhat's the color of the tower?",
+ "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501
+ "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501
}),
- multi_image_prompt="image_1:<image>\nimage_2:<image>\nDescribe the two images shortly.", # noqa: E501
+ multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
- image_size_factors=[(0.10, 0.15)],
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501
- num_logprobs=5,
+ image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
marks=[
pytest.mark.skipif(
- not is_flash_attn_2_available(),
- reason="Model needs flash-attn for numeric convergence.",
- ),
- large_gpu_mark(min_gb=48),
+ TRANSFORMERS_VERSION >= "4.48.0",
+ reason="HF model is not compatible with transformers>=4.48.0",
+ )
],
),
"fuyu": VLMTestInfo(
diff --git a/tests/models/registry.py b/tests/models/registry.py
index b0f0f9767a90f..938c838617e8b 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -181,8 +181,7 @@ class _HfExamplesInfo:
trust_remote_code=True),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
is_available_online=False),
- # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported
- "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"), # noqa: E501
+ "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 99fa941c055d2..4553695022169 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -356,13 +356,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
)
+ if self.text_config.topk_method == "noaux_tc":
+ architectures = ["DeepseekV3ForCausalLM"]
+ elif not self.text_config.use_mla:
+ architectures = ["DeepseekForCausalLM"]
+ else:
+ architectures = ["DeepseekV2ForCausalLM"]
+
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=self.text_config,
prefix=maybe_prefix(prefix, "language"),
- architectures=["DeepseekV3ForCausalLM"]
- if self.text_config.topk_method == "noaux_tc" else
- ["DeepseekV2ForCausalLM"],
+ architectures=architectures,
)
self.make_empty_intermediate_tensors = (
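
To make the new backbone dispatch easy to eyeball, here is a standalone sketch of the same three-way selection with illustrative configs. The attribute names `topk_method` and `use_mla` come from the hunk; the `SimpleNamespace` configs are hypothetical stand-ins, not real HF text configs.

```python
from types import SimpleNamespace


def pick_language_arch(text_config) -> list:
    # V3-style routing uses the "noaux_tc" top-k method.
    if getattr(text_config, "topk_method", None) == "noaux_tc":
        return ["DeepseekV3ForCausalLM"]
    # No MLA: plain Deepseek MoE backbone (the case this PR adds support for).
    if not getattr(text_config, "use_mla", True):
        return ["DeepseekForCausalLM"]
    # Default: DeepSeek-V2 backbone with MLA.
    return ["DeepseekV2ForCausalLM"]


if __name__ == "__main__":
    no_mla = SimpleNamespace(topk_method="greedy", use_mla=False)
    with_mla = SimpleNamespace(topk_method="greedy", use_mla=True)
    v3_style = SimpleNamespace(topk_method="noaux_tc", use_mla=True)
    assert pick_language_arch(no_mla) == ["DeepseekForCausalLM"]
    assert pick_language_arch(with_mla) == ["DeepseekV2ForCausalLM"]
    assert pick_language_arch(v3_style) == ["DeepseekV3ForCausalLM"]
```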