dora-rs · haixuanTao · Oct 7, 2024 · Sep 26, 2024 · Oct 1, 2024 · Oct 7, 2024
diff --git a/node-hub/dora-qwenvl/dora_qwenvl/main.py b/node-hub/dora-qwenvl/dora_qwenvl/main.py
@@ -12,6 +12,7 @@
     "DEFAULT_QUESTION",
     "Describe this image",
 )
+ADAPTER_PATH = os.getenv("ADAPTER_PATH", "")
 
 # Check if flash_attn is installed
 try:
@@ -23,16 +24,20 @@
         device_map="auto",
         attn_implementation="flash_attention_2",
     )
-except ImportError:
+except (ImportError, ModuleNotFoundError):
     model = Qwen2VLForConditionalGeneration.from_pretrained(
         CUSTOM_MODEL_PATH,
         torch_dtype="auto",
         device_map="auto",
     )
 
 
+if ADAPTER_PATH != "":
+    model.load_adapter(ADAPTER_PATH, "dora")
+
+
 # default processor
-processor = AutoProcessor.from_pretrained(DEFAULT_PATH)
+processor = AutoProcessor.from_pretrained(CUSTOM_MODEL_PATH)
 
 
 def generate(frames: dict, question):

diff --git a/node-hub/dora-qwenvl/pyproject.toml b/node-hub/dora-qwenvl/pyproject.toml
@@ -16,7 +16,7 @@ dora-rs = "^0.3.6"
 numpy = "< 2.0.0"
 torch = "^2.4.0"
 torchvision = "^0.19"
-transformers = { git = "https://github.com/huggingface/transformers" }
+transformers = "^4.45"
 qwen-vl-utils = "^0.0.2"
 accelerate = "^0.33"
 # flash_attn = "^2.6.1" # Install using: pip install -U flash-attn --no-build-isolation

diff --git a/node-hub/llama-factory-recorder/llama_factory_recorder/main.py b/node-hub/llama-factory-recorder/llama_factory_recorder/main.py
@@ -42,7 +42,7 @@ def write_dict_to_json(file_path, key: str, new_data):
 
 
 def save_image_and_add_to_json(
-    image_array, root_path, llama_root_path, jsonl_file, messages
+    frame_dict: dict, root_path, llama_root_path, jsonl_file, messages
 ):
     """
     Saves an image from a NumPy array and adds a new JSON object as a line to a JSONL file.
@@ -69,17 +69,19 @@ def save_image_and_add_to_json(
             if os.path.isfile(os.path.join(llama_root_path / root_path, name))
         ]
     )
+    image_paths = []
+    for event_id, data in frame_dict.items():
+        # Define the image filename
+        image_filename = f"{event_id}-{image_id}.png"
+        image_path = os.path.join(root_path, image_filename)
 
-    # Define the image filename
-    image_filename = f"{image_id}.png"
-    image_path = os.path.join(root_path, image_filename)
-
-    # Save the image
-    image = Image.fromarray(image_array)
-    image.save(llama_root_path / image_path)
+        # Save the image
+        image = Image.fromarray(data)
+        image.save(llama_root_path / image_path)
+        image_paths.append(image_path)
 
     # Create the JSON entry with 'messages' and 'images'
-    new_entry = {"messages": messages, "images": [image_path]}
+    new_entry = {"messages": messages, "images": image_paths}
 
     # Add the entry to the JSONL file with UTF-8 encoding
     with open(jsonl_file, "a", encoding="utf-8") as f:
@@ -123,15 +125,15 @@ def main():
     )
 
     question = DEFAULT_QUESTION
-    frame = None
+    frames = {}
 
     for event in node:
         event_type = event["type"]
 
         if event_type == "INPUT":
             event_id = event["id"]
 
-            if event_id == "image":
+            if "image" in event_id:
                 storage = event["value"]
                 metadata = event["metadata"]
                 encoding = metadata["encoding"]
@@ -153,7 +155,7 @@ def main():
                     .reshape((height, width, channels))
                 )
                 if encoding == "bgr8":
-                    frame = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
+                    frames[event_id] = frame[:, :, ::-1]  # OpenCV image (BGR to RGB)
                 elif encoding == "rgb8":
-                    pass
+                    frames[event_id] = frame  # already RGB: store as-is so it is recorded
                 else:
@@ -164,20 +166,23 @@ def main():
                 if text != "":
                     question = text
             elif event_id == "ground_truth":
-                if frame is None:
+                if not frames:
                     continue
                 ground_truth = event["value"][0].as_py()
 
                 messages = [
-                    {"content": "<image>" + question, "role": "user"},
+                    {
+                        "content": "<image>" * len(frames) + question,
+                        "role": "user",
+                    },
                     {
                         "content": ground_truth,
                         "role": "assistant",
                     },
                 ]
 
                 save_image_and_add_to_json(
-                    image_array=frame,
+                    frame_dict=frames,
                     root_path=entry_name,
                     llama_root_path=llama_factory_root_path,
                     jsonl_file=default_record_json_path,