Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Frontend] Enable decord to load video from base64 #11492

Merged
merged 3 commits into from
Dec 25, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 19 additions & 20 deletions vllm/multimodal/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,7 @@ async def async_fetch_image(image_url: str,
return image.convert(image_mode)


def _load_video_frames_from_bytes(b: bytes):
    """Open the encoded image held in ``b`` and return it as a NumPy array."""
    # Image.open needs a file-like object, so wrap the raw bytes in memory.
    buffer = BytesIO(b)
    return np.array(Image.open(buffer))


def load_video_frames_from_base64(frame: Union[bytes, str]):
    """Decode a base64-encoded frame and load it via the bytes loader."""
    raw = base64.b64decode(frame)
    return _load_video_frames_from_bytes(raw)


def _load_video_from_bytes(b: bytes, num_frames: int = 32):
def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
_, decord = try_import_video_packages()

video_path = BytesIO(b)
Expand All @@ -155,13 +145,17 @@ def _load_video_from_bytes(b: bytes, num_frames: int = 32):
return frames


def _load_video_from_data_url(video_url: str):
    """Build a stacked frame array from a comma-separated data URL.

    The text before the first comma is discarded; every remaining
    comma-separated piece is decoded as one base64 frame, and the decoded
    frames are combined with ``np.stack``.
    """
    # The URL is split on *every* comma: the first piece is the header,
    # and each later piece is treated as an individual base64 frame.
    _header, *encoded_frames = video_url.split(",")
    decoded = [
        load_video_frames_from_base64(chunk) for chunk in encoded_frames
    ]
    return np.stack(decoded)
def _load_video_from_data_url(video_url: str) -> npt.NDArray:
# Only split once and assume the second part is the base64 encoded video
_, video_base64 = video_url.split(",", 1)

if video_url.startswith("data:video/jpeg;"):
return np.stack([
np.array(load_image_from_base64(frame_base64))
for frame_base64 in video_base64.split(",")
])

return load_video_from_base64(video_base64)


def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray:
Expand Down Expand Up @@ -342,7 +336,7 @@ def rescale_image_size(image: Image.Image,
return image


def try_import_video_packages() -> Any:
def try_import_video_packages():
try:
import cv2
import decord
Expand Down Expand Up @@ -384,7 +378,7 @@ def sample_frames_from_video(frames: npt.NDArray,
return sampled_frames


def encode_video_base64(frames: npt.NDArray):
def encode_video_base64(frames: npt.NDArray) -> str:
base64_frames = []
frames_list = [frames[i] for i in range(frames.shape[0])]
for frame in frames_list:
Expand All @@ -393,6 +387,11 @@ def encode_video_base64(frames: npt.NDArray):
return ",".join(base64_frames)


def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray:
    """Decode a base64-encoded video and load it via the bytes loader."""
    raw = base64.b64decode(video)
    return _load_video_from_bytes(raw)


def resolve_visual_encoder_outputs(
encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
feature_sample_layers: Optional[list[int]],
Expand Down
Loading