From 0ae256de34e3c95d8de2e161eda13650895661c8 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 26 Dec 2024 00:33:55 +0800
Subject: [PATCH] [Frontend] Enable decord to load video from base64 (#11492)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/multimodal/utils.py | 39 +++++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c898ca4e6573e..be9643598448d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -125,17 +125,7 @@ async def async_fetch_image(image_url: str,
     return image.convert(image_mode)
 
 
-def _load_video_frames_from_bytes(b: bytes):
-    frame = Image.open(BytesIO(b))
-    return np.array(frame)
-
-
-def load_video_frames_from_base64(frame: Union[bytes, str]):
-    """Load frame from base64 format."""
-    return _load_video_frames_from_bytes(base64.b64decode(frame))
-
-
-def _load_video_from_bytes(b: bytes, num_frames: int = 32):
+def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
     _, decord = try_import_video_packages()
 
     video_path = BytesIO(b)
@@ -155,13 +145,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
     return frames
 
 
-def _load_video_from_data_url(video_url: str):
-    # Only split once and assume the second part is the base64 encoded image
-    frames_base64 = video_url.split(",")[1:]
-    return np.stack([
-        load_video_frames_from_base64(frame_base64)
-        for frame_base64 in frames_base64
-    ])
+def _load_video_from_data_url(video_url: str) -> npt.NDArray:
+    # Split at the first comma only: the rest is the base64-encoded payload
+    _, video_base64 = video_url.split(",", 1)
+    # "data:video/jpeg;" payloads are comma-separated base64 image frames
+    if video_url.startswith("data:video/jpeg;"):
+        return np.stack([
+            np.array(load_image_from_base64(frame_base64))
+            for frame_base64 in video_base64.split(",")
+        ])
+
+    return load_video_from_base64(video_base64)
 
 
 def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
@@ -342,7 +336,7 @@ def rescale_image_size(image: Image.Image,
     return image
 
 
-def try_import_video_packages() -> Any:
+def try_import_video_packages():
     try:
         import cv2
         import decord
@@ -384,7 +378,7 @@ def sample_frames_from_video(frames: npt.NDArray,
         return sampled_frames
 
 
-def encode_video_base64(frames: npt.NDArray):
+def encode_video_base64(frames: npt.NDArray) -> str:
     base64_frames = []
     frames_list = [frames[i] for i in range(frames.shape[0])]
     for frame in frames_list:
@@ -393,6 +387,11 @@ def encode_video_base64(frames: npt.NDArray):
     return ",".join(base64_frames)
 
 
+def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
+    """Decode a base64-encoded video and load it as an array of frames."""
+    return _load_video_from_bytes(base64.b64decode(video))
+
+
 def resolve_visual_encoder_outputs(
     encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
     feature_sample_layers: Optional[list[int]],