Support Several MLLM Models #4136

Status: Closed · wants to merge 46 commits

Commits (46)
9abd1b8
add videollava
BUAADreamer Jun 6, 2024
ddad20f
add videollava and demo video data
BUAADreamer Jun 6, 2024
115ffbe
add videollava and demo video data
BUAADreamer Jun 6, 2024
7cdc262
fix processor conflict
BUAADreamer Jun 7, 2024
0b7535e
fix supervised conflict
BUAADreamer Jun 7, 2024
4e97a83
support video-llava
BUAADreamer Jun 7, 2024
adb3b26
support video-llava
BUAADreamer Jun 7, 2024
ef76387
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 8, 2024
76c6379
add av to requirements
BUAADreamer Jun 8, 2024
3a53b3c
add llava-next/idefics2
BUAADreamer Jun 8, 2024
3188a56
support video-llava/llava-next/idefics2(4.42)
BUAADreamer Jun 8, 2024
daeffb4
modify idefics2 template
BUAADreamer Jun 8, 2024
3fc87e8
Update requirements.txt
hiyouga Jun 8, 2024
0f46edb
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 8, 2024
d2e4362
modify position of idefics2 in template
BUAADreamer Jun 8, 2024
307e423
Merge branch 'main' of https://github.com/BUAADreamer/LLaMA-Factory
BUAADreamer Jun 8, 2024
7d2a8f3
align preprocess_supervised_dataset implementation
BUAADreamer Jun 8, 2024
a0fe536
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 11, 2024
7689c9d
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 17, 2024
ed9e8ec
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 19, 2024
7dbe875
Merge branch 'main' into main
hiyouga Jun 24, 2024
f5c0b34
Merge branch 'hiyouga:main' into main
BUAADreamer Jun 28, 2024
1d5e9d5
finetune right for video model
BUAADreamer Jun 29, 2024
a99abcd
add image_data/video_data key to template to flexibly support more MLLMs
BUAADreamer Jun 30, 2024
00a2923
Merge branch 'main' into main
BUAADreamer Jun 30, 2024
722e189
add video inference
BUAADreamer Jun 30, 2024
9092279
Merge branch 'main' of https://github.com/BUAADreamer/LLaMA-Factory
BUAADreamer Jun 30, 2024
98a2e3d
fix some
BUAADreamer Jul 1, 2024
a571c4c
Merge branch 'hiyouga:main' into main
BUAADreamer Jul 1, 2024
d092749
Merge branch 'hiyouga:main' into main
BUAADreamer Jul 2, 2024
7d0419d
support idefics2/llava_next inference right
BUAADreamer Jul 2, 2024
e65537d
add model constants
BUAADreamer Jul 2, 2024
5023974
Merge branch 'main' into main
BUAADreamer Jul 2, 2024
d5563d3
Merge branch 'hiyouga:main' into main
BUAADreamer Jul 15, 2024
ca44c8d
solve the predict problem of llava-next-video and the multi-gpu finet…
BUAADreamer Jul 15, 2024
abdc2fa
Merge branch 'hiyouga:main' into main
BUAADreamer Jul 15, 2024
7b5b32f
Merge branch 'main' into main
BUAADreamer Jul 22, 2024
66980bf
Merge branch 'main' into main
BUAADreamer Aug 22, 2024
f033b3d
add if condition for llava-video
Kuangdd01 Aug 24, 2024
a96e29e
Merge pull request #1 from BUAADreamer/main
Kuangdd01 Aug 24, 2024
800793a
Merge branch 'main' of https://github.com/Kuangdd01/LLaMA-Factory-X
Kuangdd01 Aug 24, 2024
9eac318
fix some errors
Kuangdd01 Aug 25, 2024
e116d34
remove redundant import
Kuangdd01 Aug 25, 2024
7e59b76
Merge pull request #2 from Kuangdd01/main
BUAADreamer Aug 25, 2024
201593d
add visual model config for llava-next-video
Kuangdd01 Aug 28, 2024
24526fe
Merge pull request #3 from Kuangdd01/main
BUAADreamer Aug 28, 2024
Files changed
29 changes: 29 additions & 0 deletions data/dataset_info.json
@@ -38,6 +38,35 @@
"assistant_tag": "assistant"
}
},
"video_demo": {
"file_name": "video_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"videos": "videos"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"visual_mix_demo": {
"file_name": "visual_mix_demo.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages",
"videos": "videos",
"images": "images"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant"
}
},
"alpaca_en": {
"hf_hub_url": "llamafactory/alpaca_en",
"ms_hub_url": "llamafactory/alpaca_en"
Binary file added data/mllm_demo_data/1.mp4
Binary file not shown.
Binary file added data/mllm_demo_data/2.avi
Binary file not shown.
Binary file added data/mllm_demo_data/3.mp4
Binary file not shown.
47 changes: 47 additions & 0 deletions data/video_demo.json
@@ -0,0 +1,47 @@
[
  {
    "messages": [
      {
        "content": "Why is this video funny?",
        "role": "user"
      },
      {
        "content": "Because a baby is reading, and he is so cute!",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/1.mp4"
    ]
  },
  {
    "messages": [
      {
        "content": "What is she doing?",
        "role": "user"
      },
      {
        "content": "She is cooking",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/2.avi"
    ]
  },
  {
    "messages": [
      {
        "content": "What's in the video?",
        "role": "user"
      },
      {
        "content": "A baby is playing in the living room",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/3.mp4"
    ]
  }
]
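Each sample above follows the `sharegpt` schema registered in `dataset_info.json`: the `columns` entry maps the loader's `messages`/`videos` fields to JSON keys, and the `tags` entry names the role and content keys. As a quick standalone sanity check (a minimal sketch, not part of the PR; paths assume the repository root):

```python
import json
import os

# Validate the demo file against its dataset_info.json entry (illustrative only).
with open("data/dataset_info.json") as f:
    info = json.load(f)["video_demo"]

columns, tags = info["columns"], info["tags"]
with open(os.path.join("data", info["file_name"])) as f:
    samples = json.load(f)

for sample in samples:
    messages = sample[columns["messages"]]
    # One user turn followed by one assistant turn, using the declared tags.
    roles = [m[tags["role_tag"]] for m in messages]
    assert roles == [tags["user_tag"], tags["assistant_tag"]]
    for path in sample[columns["videos"]]:
        assert os.path.isfile(os.path.join("data", path)), f"missing video: {path}"
```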
56 changes: 56 additions & 0 deletions data/visual_mix_demo.json
@@ -0,0 +1,56 @@
[
  {
    "messages": [
      {
        "content": "<video>Why is this video funny?<image>",
        "role": "user"
      },
      {
        "content": "Because a baby is reading, and he is so cute!",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/1.mp4"
    ],
    "images": [
      "mllm_demo_data/1.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<video>What is she doing?<image>",
        "role": "user"
      },
      {
        "content": "She is cooking",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/2.avi"
    ],
    "images": [
      "mllm_demo_data/2.jpg"
    ]
  },
  {
    "messages": [
      {
        "content": "<video>Why is this video funny?<image>",
        "role": "user"
      },
      {
        "content": "A baby is playing!",
        "role": "assistant"
      }
    ],
    "videos": [
      "mllm_demo_data/3.mp4"
    ],
    "images": [
      "mllm_demo_data/3.jpg"
    ]
  }
]
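Unlike `video_demo.json`, the mixed samples place explicit `<video>` and `<image>` placeholders inside the user turn; the template later substitutes model-specific media tokens at those positions. A simple consistency check is that the number of placeholders matches the lengths of the `videos` and `images` lists (a hedged sketch; the placeholder strings are taken from the demo data above, not from the template code):

```python
import json

VIDEO_PLACEHOLDER = "<video>"  # placeholder strings as used in visual_mix_demo.json
IMAGE_PLACEHOLDER = "<image>"

with open("data/visual_mix_demo.json") as f:
    samples = json.load(f)

for sample in samples:
    text = "".join(m["content"] for m in sample["messages"])
    # Every placeholder should correspond to exactly one media file.
    assert text.count(VIDEO_PLACEHOLDER) == len(sample.get("videos", []))
    assert text.count(IMAGE_PLACEHOLDER) == len(sample.get("images", []))
```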
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -22,7 +22,8 @@ known-third-party = [
"peft",
"torch",
"transformers",
"trl"
"trl",
"av",
]

[tool.ruff.format]
1 change: 1 addition & 0 deletions requirements.txt
@@ -18,4 +18,5 @@ matplotlib>=3.7.0
fire
packaging
pyyaml
av
numpy<2.0.0
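`av` (PyAV) is the new dependency that makes video decoding possible. Roughly, a helper like `get_pixel_values_videos` has to open the container, sample a fixed number of frames, and hand them to the model's image processor. The sketch below shows only the decoding and sampling half; the frame count of 8 is an arbitrary assumption, not the PR's actual default:

```python
import av
import numpy as np

def sample_frames(path: str, num_frames: int = 8) -> np.ndarray:
    """Uniformly sample RGB frames from a video file (illustrative sketch)."""
    container = av.open(path)
    stream = container.streams.video[0]
    total = stream.frames  # may be 0 for some codecs; real code needs a fallback
    indices = set(np.linspace(0, max(total - 1, 0), num_frames).astype(int).tolist())
    frames = []
    for i, frame in enumerate(container.decode(stream)):
        if i in indices:
            frames.append(frame.to_ndarray(format="rgb24"))
    container.close()
    return np.stack(frames)  # (num_sampled, H, W, C), ready for an image processor

frames = sample_frames("data/mllm_demo_data/1.mp4")
```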
2 changes: 2 additions & 0 deletions src/llamafactory/chat/base_engine.py
@@ -57,6 +57,7 @@ async def chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> List["Response"]: ...

@@ -67,6 +68,7 @@ async def stream_chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]: ...

6 changes: 4 additions & 2 deletions src/llamafactory/chat/chat_model.py
@@ -78,9 +78,10 @@ def stream_chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> Generator[str, None, None]:
-        generator = self.astream_chat(messages, system, tools, image, **input_kwargs)
+        generator = self.astream_chat(messages, system, tools, image, video, **input_kwargs)
        while True:
            try:
                task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop)
@@ -94,9 +95,10 @@ async def astream_chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]:
-        async for new_token in self.engine.stream_chat(messages, system, tools, image, **input_kwargs):
+        async for new_token in self.engine.stream_chat(messages, system, tools, image, video, **input_kwargs):
            yield new_token

    def get_scores(
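With `video` threaded through `stream_chat` and `astream_chat`, callers can pass decoded video frames alongside an image. A hypothetical usage sketch (the model id, template name, and file path are placeholders for illustration, not values taken from this PR):

```python
import numpy as np

from llamafactory.chat import ChatModel

chat_model = ChatModel(dict(
    model_name_or_path="LanguageBind/Video-LLaVA-7B-hf",  # assumed model id
    template="video_llava",                               # assumed template name
))

messages = [{"role": "user", "content": "<video>What is happening in this video?"}]
video = np.load("frames.npy")  # pre-decoded frames, e.g. from the PyAV sketch above

for new_text in chat_model.stream_chat(messages, video=video):
    print(new_text, end="", flush=True)
```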
60 changes: 49 additions & 11 deletions src/llamafactory/chat/hf_engine.py
@@ -26,12 +26,11 @@
from ..extras.misc import get_logits_processor
from ..model import load_model, load_tokenizer
from .base_engine import BaseEngine, Response

from ..data.processors.processor_utils import get_pixel_values, get_pixel_values_videos

if TYPE_CHECKING:
    from numpy.typing import NDArray
    from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
    from transformers.image_processing_utils import BaseImageProcessor
    from trl import PreTrainedModelWrapper

    from ..data import Template
@@ -79,30 +78,58 @@ def _process_args(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        input_kwargs: Optional[Dict[str, Any]] = {},
    ) -> Tuple[Dict[str, Any], int]:
        processor_class = "" if processor is None else type(processor).__name__

        visual_token_flag = template.image_token in messages[0]["content"] or \
            template.video_token in messages[0]["content"]

        if (
            processor is not None
            and image is not None
-            and not hasattr(processor, "image_seq_length")
+            and not processor_class == 'PaliGemmaProcessor'
-            and template.image_token not in messages[0]["content"]
-        ):  # llava-like models
+            and not visual_token_flag
+        ):
            messages[0]["content"] = template.image_token + messages[0]["content"]

        if (
            processor is not None
            and video is not None
            and template.video_token not in messages[0]["content"]
            and not visual_token_flag
        ):
            messages[0]["content"] = template.video_token + messages[0]["content"]

        if processor_class == 'Idefics2Processor':
            fake_image_token = processor.fake_image_token.content
            image_str = f"{fake_image_token}{template.image_token * processor.image_seq_len}{fake_image_token}"
            image_str = image_str * 5
            for j in range(len(messages)):
                content = messages[j]['content']
                content = content.replace(template.image_token, image_str)
                content = content.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
                messages[j]['content'] = content

        paired_messages = messages + [{"role": "assistant", "content": ""}]
        system = system or generating_args["default_system"]
        pixel_values = None
        pixel_values_video = None
        prompt_ids, _ = template.encode_oneturn(
            tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools
        )

        if processor is not None and image is not None:  # add image features
-            image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
-            batch_feature = image_processor(image, return_tensors="pt")
-            pixel_values = batch_feature.to(model.device)["pixel_values"]  # shape (B, C, H, W)
-            if hasattr(processor, "image_seq_length"):  # paligemma models
+            pixel_values = get_pixel_values([image], processor, template.image_data_key)
+            if processor_class == 'PaliGemmaProcessor':  # paligemma models
                image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
                prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids

        if processor is not None and video is not None:  # add video features
            pixel_values_video = get_pixel_values_videos([video], processor, template.video_data_key)

        prompt_length = len(prompt_ids)
        inputs = torch.tensor([prompt_ids], device=model.device)
        attention_mask = torch.ones_like(inputs, dtype=torch.bool)
@@ -165,7 +192,12 @@ def _process_args(
        )

        if pixel_values is not None:
-            gen_kwargs["pixel_values"] = pixel_values
+            for key in template.image_data_key:
+                gen_kwargs[key] = pixel_values[key].to(model.device)

        if pixel_values_video is not None:
            for key in template.video_data_key:
                gen_kwargs[key] = pixel_values_video[key].to(model.device)

        return gen_kwargs, prompt_length

@@ -181,10 +213,11 @@ def _chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        input_kwargs: Optional[Dict[str, Any]] = {},
    ) -> List["Response"]:
        gen_kwargs, prompt_length = HuggingfaceEngine._process_args(
-            model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
+            model, tokenizer, processor, template, generating_args, messages, system, tools, image, video, input_kwargs
        )
        generate_output = model.generate(**gen_kwargs)
        response_ids = generate_output[:, prompt_length:]
@@ -216,10 +249,11 @@ def _stream_chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        input_kwargs: Optional[Dict[str, Any]] = {},
    ) -> Callable[[], str]:
        gen_kwargs, _ = HuggingfaceEngine._process_args(
-            model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs
+            model, tokenizer, processor, template, generating_args, messages, system, tools, image, video, input_kwargs
        )
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs["streamer"] = streamer
@@ -273,6 +307,7 @@ async def chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> List["Response"]:
        if not self.can_generate:
@@ -289,6 +324,7 @@
            system,
            tools,
            image,
            video,
            input_kwargs,
        )
        async with self.semaphore:
@@ -301,6 +337,7 @@ async def stream_chat(
        system: Optional[str] = None,
        tools: Optional[str] = None,
        image: Optional["NDArray"] = None,
        video: Optional["NDArray"] = None,
        **input_kwargs,
    ) -> AsyncGenerator[str, None]:
        if not self.can_generate:
@@ -317,6 +354,7 @@
            system,
            tools,
            image,
            video,
            input_kwargs,
        )
        async with self.semaphore:
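The least obvious branch in `_process_args` is the Idefics2 prompt expansion: each `<image>` placeholder becomes `image_seq_len` image tokens wrapped in fake image tokens, repeated five times (presumably matching Idefics2's image splitting into four crops plus the full image), with doubled fake tokens collapsed at the seams. Restated as a standalone function (a sketch; the default token strings and sequence length are assumptions based on the Idefics2 processor, not values defined in this PR):

```python
def expand_idefics2_image_tokens(
    content: str,
    image_token: str = "<image>",
    fake_image_token: str = "<fake_token_around_image>",  # assumed Idefics2 default
    image_seq_len: int = 64,                              # assumed Idefics2 default
    num_copies: int = 5,                                  # 4 crops + full image
) -> str:
    """Expand each <image> placeholder the way the Idefics2 branch above does."""
    image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
    image_str = image_str * num_copies
    content = content.replace(image_token, image_str)
    # Collapse the doubled fake tokens created at the seams between copies.
    return content.replace(fake_image_token * 2, fake_image_token)

expanded = expand_idefics2_image_tokens("<image>Describe this image.")
```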