
Commit

[Doc] Add video example to openai client for multimodal (vllm-project#11521)

Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Signed-off-by: xcnick <[email protected]>
2 people authored and xcnick committed Dec 31, 2024
1 parent 64641ac commit 0f2af42
Showing 2 changed files with 114 additions and 11 deletions.
52 changes: 49 additions & 3 deletions docs/source/usage/multimodal_inputs.md
@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>

### Video

Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).

First, launch the OpenAI-compatible server:

```bash
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
```
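
Once the server is up, you can optionally sanity-check it before running the client; for example (assuming the default port of 8000):

```bash
curl http://localhost:8000/v1/models
```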

Then, you can use the OpenAI client as follows:
```python
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Must match the model passed to `vllm serve` above.
model = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"

## Use video url in the payload
chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)

result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```
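
You can also inline the video as a base64-encoded data URL instead of a remote URL, as the full example below does. Here is a minimal sketch that reuses `client`, `model`, and `video_url` from the snippet above:

```python
import base64

import requests

## Use base64 encoded video in the payload (sketch)
video_bytes = requests.get(video_url).content
video_base64 = base64.b64encode(video_bytes).decode("utf-8")

chat_completion_from_base64 = client.chat.completions.create(
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    # Inline the video bytes as a data URL
                    "url": f"data:video/mp4;base64,{video_base64}"
                },
            },
        ],
    }],
    model=model,
    max_completion_tokens=64,
)
print("Chat completion output from base64 encoded video:",
      chat_completion_from_base64.choices[0].message.content)
```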

Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>

````{note}
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable:
```console
$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
```
````
73 changes: 65 additions & 8 deletions examples/openai_chat_completion_client_for_multimodal.py
@@ -18,7 +18,6 @@
import requests
from openai import OpenAI

from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
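
The collapsed lines between these hunks hold the client setup and the `encode_base64_content_from_url` helper used by `run_video` and `run_audio` below. A sketch of that setup, consistent with the docs snippet above and with how the helper is called (treat the exact definitions as assumptions, not verbatim file contents):

```python
import base64

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Query the server for the name of the served model.
models = client.models.list()
model = models.data[0].id


def encode_base64_content_from_url(content_url: str) -> str:
    """Fetch content from a remote URL and base64-encode it."""
    with requests.get(content_url) as response:
        response.raise_for_status()
        return base64.b64encode(response.content).decode("utf-8")
```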
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
print("Chat completion output:", result)


# Video input inference
def run_video() -> None:
    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
    video_base64 = encode_base64_content_from_url(video_url)

    ## Use video url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": video_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from video url:", result)

    ## Use base64 encoded video in the payload
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this video?"
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": f"data:video/mp4;base64,{video_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded video:", result)


# Audio input inference
def run_audio() -> None:
    from vllm.assets.audio import AudioAsset

    audio_url = AudioAsset("winning_call").url
    audio_base64 = encode_base64_content_from_url(audio_url)

@@ -240,6 +297,7 @@ def run_audio() -> None:
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"video": run_video,
"audio": run_audio,
}

@@ -253,12 +311,11 @@ def main(args) -> None:
parser = FlexibleArgumentParser(
    description='Demo on using OpenAI client for online inference with '
    'multimodal language models served with vLLM.')
parser.add_argument('--chat-type',
                    '-c',
                    type=str,
                    default="single-image",
                    choices=list(example_function_map.keys()),
                    help='Conversation type with multimodal data.')
args = parser.parse_args()
main(args)
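
With `"video"` registered in `example_function_map` and the `--chat-type` choices derived from it, the new demo can be selected from the command line, for example:

```bash
# Assumes the vLLM server from the docs section above is already running.
python examples/openai_chat_completion_client_for_multimodal.py --chat-type video
```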
