From 0ae256de34e3c95d8de2e161eda13650895661c8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 26 Dec 2024 00:33:55 +0800 Subject: [PATCH] [Frontend] Enable decord to load video from base64 (#11492) Signed-off-by: DarkLight1337 --- vllm/multimodal/utils.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c898ca4e6573e..be9643598448d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -125,17 +125,7 @@ async def async_fetch_image(image_url: str, return image.convert(image_mode) -def _load_video_frames_from_bytes(b: bytes): - frame = Image.open(BytesIO(b)) - return np.array(frame) - - -def load_video_frames_from_base64(frame: Union[bytes, str]): - """Load frame from base64 format.""" - return _load_video_frames_from_bytes(base64.b64decode(frame)) - - -def _load_video_from_bytes(b: bytes, num_frames: int = 32): +def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: _, decord = try_import_video_packages() video_path = BytesIO(b) @@ -155,13 +145,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32): return frames -def _load_video_from_data_url(video_url: str): - # Only split once and assume the second part is the base64 encoded image - frames_base64 = video_url.split(",")[1:] - return np.stack([ - load_video_frames_from_base64(frame_base64) - for frame_base64 in frames_base64 - ]) +def _load_video_from_data_url(video_url: str) -> npt.NDArray: + # Only split once and assume the second part is the base64 encoded video + _, video_base64 = video_url.split(",", 1) + + if video_url.startswith("data:video/jpeg;"): + return np.stack([ + np.array(load_image_from_base64(frame_base64)) + for frame_base64 in video_base64.split(",") + ]) + + return load_video_from_base64(video_base64) def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: @@ -342,7 +336,7 @@ def rescale_image_size(image: Image.Image, return image -def try_import_video_packages() -> Any: +def try_import_video_packages(): try: import cv2 import decord @@ -384,7 +378,7 @@ def sample_frames_from_video(frames: npt.NDArray, return sampled_frames -def encode_video_base64(frames: npt.NDArray): +def encode_video_base64(frames: npt.NDArray) -> str: base64_frames = [] frames_list = [frames[i] for i in range(frames.shape[0])] for frame in frames_list: @@ -393,6 +387,11 @@ def encode_video_base64(frames: npt.NDArray): return ",".join(base64_frames) +def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray: + """Load video from base64 format.""" + return _load_video_from_bytes(base64.b64decode(video)) + + def resolve_visual_encoder_outputs( encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], feature_sample_layers: Optional[list[int]],