From 9abd1b80eb86f620651997852282860d449c1ef3 Mon Sep 17 00:00:00 2001
From: BUAADreamer <1428195643@qq.com>
Date: Fri, 7 Jun 2024 01:45:06 +0800
Subject: [PATCH 01/23] add videollava

---
 src/llamafactory/data/aligner.py             | 24 ++++++++++-
 src/llamafactory/data/parser.py              |  3 +-
 src/llamafactory/data/processors/mm_utils.py | 42 ++++++++++++++++++-
 .../data/processors/supervised.py            | 24 +++++++++--
 src/llamafactory/data/template.py            |  3 ++
 src/llamafactory/hparams/model_args.py       |  4 ++
 src/llamafactory/model/loader.py             |  2 +-
 7 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/src/llamafactory/data/aligner.py b/src/llamafactory/data/aligner.py
index 2a382c60db..8e4ceb45c4 100644
--- a/src/llamafactory/data/aligner.py
+++ b/src/llamafactory/data/aligner.py
@@ -33,14 +33,30 @@ def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "
     return outputs
 
 
+def _convert_videos(videos: List[Any], dataset_attr: "DatasetAttr", data_args: "DataArguments") -> List[Any]:
+    r"""
+    Optionally concatenates video path to dataset dir when loading from local disk.
+    """
+    outputs = []
+    if dataset_attr.load_from in ["script", "file"]:
+        for video in videos:
+            if isinstance(video, str) and os.path.isfile(os.path.join(data_args.dataset_dir, video)):
+                outputs.append(os.path.join(data_args.dataset_dir, video))
+            else:
+                outputs.append(video)
+
+    return outputs
+
+
 def convert_alpaca(
     examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
 ) -> Dict[str, List[Any]]:
     r"""
     Converts alpaca format dataset to the standard format.
     """
-    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": [], "videos": []}
     convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
+    convert_videos = partial(_convert_videos, dataset_attr=dataset_attr, data_args=data_args)
     for i in range(len(examples[dataset_attr.prompt])):
         prompt = []
         if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list):
@@ -82,6 +98,7 @@ def convert_alpaca(
         outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
+        outputs["videos"].append(convert_videos(examples[dataset_attr.videos][i]) if dataset_attr.videos else [])
 
     return outputs
 
@@ -92,8 +109,9 @@ def convert_sharegpt(
     r"""
     Converts sharegpt format dataset to the standard format.
     """
-    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
+    outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": [], "videos": []}
     convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
+    convert_videos = partial(_convert_videos, dataset_attr=dataset_attr, data_args=data_args)
     tag_mapping = {
         dataset_attr.user_tag: Role.USER.value,
         dataset_attr.assistant_tag: Role.ASSISTANT.value,
@@ -170,6 +188,7 @@ def convert_sharegpt(
         outputs["system"].append(system)
         outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
         outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
+        outputs["videos"].append(convert_videos(examples[dataset_attr.videos][i]) if dataset_attr.videos else [])
 
     return outputs
 
@@ -202,6 +221,7 @@ def align_dataset(
             "system": {"dtype": "string", "_type": "Value"},
             "tools": {"dtype": "string", "_type": "Value"},
             "images": [{"_type": "Image"}],
+            "videos": [{"dtype": "string", "_type": "Value"}]
         }
     )
     kwargs = {}
diff --git a/src/llamafactory/data/parser.py b/src/llamafactory/data/parser.py
index ec97bfc1a2..50fe0fe31b 100644
--- a/src/llamafactory/data/parser.py
+++ b/src/llamafactory/data/parser.py
@@ -30,6 +30,7 @@ class DatasetAttr:
     system: Optional[str] = None
     tools: Optional[str] = None
    images: Optional[str] = None
+    videos: Optional[str] = None
     """ rlhf columns """
     chosen: Optional[str] = None
     rejected: Optional[str] = None
@@ -110,7 +111,7 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]:
         dataset_attr.set_attr("num_samples", dataset_info[name])
 
         if "columns" in dataset_info[name]:
-            column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"]
+            column_names = ["system", "tools", "images", "videos", "chosen", "rejected", "kto_tag"]
             if dataset_attr.formatting == "alpaca":
                 column_names.extend(["prompt", "query", "response", "history"])
             else:
diff --git a/src/llamafactory/data/processors/mm_utils.py b/src/llamafactory/data/processors/mm_utils.py
index abc7c4b2c2..4cde4b7153 100644
--- a/src/llamafactory/data/processors/mm_utils.py
+++ b/src/llamafactory/data/processors/mm_utils.py
@@ -1,5 +1,8 @@
 from typing import TYPE_CHECKING, List, Sequence
 
+import av
+import numpy as np
+
 from ...extras.packages import is_pillow_available
 
 
@@ -12,13 +15,48 @@
     from PIL.Image import Image as ImageObject
     from transformers import ProcessorMixin
     from transformers.image_processing_utils import BaseImageProcessor
+    from av import Container
 
 
-def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray":
+def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin", image_key: "str" = "pixel_values") -> "NDArray":
     # process visual inputs (currently only supports a single image)
     image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
     image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255))
-    return image_processor(image, return_tensors="pt")["pixel_values"][0]  # shape (C, H, W)
+    return image_processor(image, return_tensors="pt")[image_key][0]  # shape (C, H, W)
+
+
+def get_pixel_values_videos(videos: Sequence["str"], processor: "ProcessorMixin", video_key: "str" = "pixel_values_videos") -> "NDArray":
+    # process video inputs (currently only supports a single video)
+    image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
+    container = av.open(videos[0])
+    total_frames = container.streams.video[0].frames
+    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+    clip = read_video_pyav(container, indices)
+    inputs = image_processor(videos=clip, padding=True, return_tensors="pt", images=None)[video_key][0]
+    return inputs
+
+
+def read_video_pyav(container: "Container", indices: "NDArray"):
+    '''
+    Decode the video with PyAV decoder.
+
+    Args:
+        container (av.container.input.InputContainer): PyAV container.
+        indices (List[int]): List of frame indices to decode.
+
+    Returns:
+        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
 
 
 def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]:
diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py
index b119aa2222..637373eb32 100644
--- a/src/llamafactory/data/processors/supervised.py
+++ b/src/llamafactory/data/processors/supervised.py
@@ -2,8 +2,7 @@
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
-from .mm_utils import get_paligemma_token_type_ids, get_pixel_values
-
+from .mm_utils import get_paligemma_token_type_ids, get_pixel_values, get_pixel_values_videos
 
 if TYPE_CHECKING:
     from transformers import ProcessorMixin
@@ -26,8 +25,17 @@ def preprocess_supervised_dataset(
     # build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
     # for multiturn examples, we only mask the prompt part in each prompt-response pair.
     model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
+
+    image_key = "pixel_values"
+    video_key = "pixel_values_videos"
+
     if processor is not None:
-        model_inputs["pixel_values"] = []
+        if len(examples["images"][0]):
+            model_inputs[image_key] = []
+
+        if len(examples["videos"][0]):
+            model_inputs[video_key] = []
+
         if hasattr(processor, "image_seq_length"):  # paligemma models
             model_inputs["token_type_ids"] = []
 
@@ -39,6 +47,9 @@ def preprocess_supervised_dataset(
         if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
             examples["prompt"][i][0]["content"] = template.image_token + examples["prompt"][i][0]["content"]
+            if len(examples["videos"][i]):
+                examples["prompt"][i][0]["content"] = template.video_token + examples["prompt"][i][0]["content"]
+
 
         messages = examples["prompt"][i] + examples["response"][i]
         input_ids, labels = [], []
 
@@ -75,7 +86,12 @@ def preprocess_supervised_dataset(
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
         if processor is not None:
-            model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor))
+            if len(examples["images"][i]):
+                model_inputs[image_key].append(get_pixel_values(examples["images"][i], processor, image_key))
+
+            if len(examples["videos"][i]):
+                model_inputs[video_key].append(get_pixel_values_videos(examples["videos"][i], processor, video_key))
+
             if hasattr(processor, "image_seq_length"):  # paligemma models
                 model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor))
 
diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py
index fe0211c674..b421f3a1cc 100644
--- a/src/llamafactory/data/template.py
+++ b/src/llamafactory/data/template.py
@@ -27,6 +27,7 @@ class Template:
     default_system: str
     stop_words: List[str]
     image_token: str
+    video_token: str
     efficient_eos: bool
     replace_eos: bool
     force_system: bool
@@ -211,6 +212,7 @@ def _register_template(
     default_system: str = "",
     stop_words: List[str] = [],
     image_token: str = "<image>",
+    video_token: str = "
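
A minimal usage sketch of the frame-sampling path that the new get_pixel_values_videos helper builds on. It only exercises read_video_pyav together with PyAV and NumPy; "sample.mp4" is a placeholder path, and the import assumes the package is installed from source so that llamafactory.data.processors.mm_utils resolves.

    import av
    import numpy as np

    from llamafactory.data.processors.mm_utils import read_video_pyav

    container = av.open("sample.mp4")  # placeholder path, not part of the patch
    total_frames = container.streams.video[0].frames
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)  # ~8 evenly spaced frame indices
    clip = read_video_pyav(container, indices)  # decode only the selected frames
    print(clip.shape)  # (num_frames, height, width, 3), RGB uint8

This mirrors what get_pixel_values_videos does before handing the clip to the model's image processor.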