fix model_dtype issue and reformat mllm code (#335)
wenhuach21 authored Nov 22, 2024
1 parent 8fb9552 commit fcaaac9
Showing 10 changed files with 209 additions and 176 deletions.
80 changes: 52 additions & 28 deletions auto_round/mllm/README.md
@@ -1,6 +1,10 @@
# AutoRound for MLLMs

## Basic Usage (Gaudi2/CPU/GPU)
A user guide detailing the full list of supported arguments is provided by calling ```auto-round-mllm -h``` on the terminal. Alternatively, you can use ```auto_round_mllm``` instead of ```auto-round-mllm```. Set the format you want in `format`; exporting to multiple formats is supported.

```bash
@@ -14,7 +18,9 @@ auto-round-mllm \
--format "auto_round" \
--output_dir ./tmp_autoround
```

## API Usage (Gaudi2/CPU/GPU)

```python
from auto_round import AutoRoundMLLM
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
@@ -24,58 +30,76 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer.processor = processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name, trust_remote_code=True)
dataset = "/path/to/llava.json"
extra_data_dir = "/path/to/images/dir"

bits, group_size = 4, 128
autoround = AutoRoundMLLM(model, tokenizer, bits=bits, group_size=group_size, dataset=dataset,
                          extra_data_dir=extra_data_dir)

autoround.quantize()
output_dir = "./tmp_autoround"
autoround.save_quantized(output_dir, format='auto_round', inplace=True)
```

### Dataset
For MLLMs, we use liuhaotian/llava_conv_58k as the default calibration dataset. Through the ```--dataset``` argument, users can select other datasets such as "liuhaotian/llava_instruct_80k" and "liuhaotian/llava_instruct_150k", or pass a file path to use a local file.
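
Below is a minimal sketch of switching the calibration data via the API. It mirrors the API Usage example above; the model id shown is only an illustrative assumption:

```python
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
from auto_round import AutoRoundMLLM

# Same setup as the API Usage example above; only the calibration dataset differs.
model_name = "Qwen/Qwen2-VL-2B-Instruct"  # illustrative model id
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
tokenizer.processor = processor
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True)

# Any dataset named above, or a path to a local llava-style json file.
autoround = AutoRoundMLLM(model, tokenizer, bits=4, group_size=128,
                          dataset="liuhaotian/llava_instruct_80k")
autoround.quantize()
```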

### Support Matrix
So far, auto-round for MLLMs supports five model families: Qwen2-VL, Llama-Vision, Phi3-Vision, Llava-v1.5, and CogVLM2.

| Model        | Eval Lib  | calibration dataset | quant nontext module |
|--------------|-----------|---------------------|----------------------|
| Qwen2-VL     | vlmeval   | pile/llava          | -                    |
| Llama-Vision | lmms_eval | llava               | ✔                    |
| Phi3-Vision  | vlmeval   | pile/llava          | ✔                    |
| Llava-v1.5   | lmms_eval | pile/llava          | -                    |
| CogVLM2      | lmms_eval | pile/llava          | ✔                    |

## New Models Support

### Template
For AutoRound MLLMs, a Template is used to customize different operations for different models. Users can add a custom chat template through a JSON file as shown below.

```json
{
"model_type": "qwen2_vl",
"format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
"format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
"format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
"format_observation": "<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n",
"format_separator": "\n",
"default_system": "You are a helpful assistant.",
"replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
"extra_encode": "True",
"processor": "qwen2_vl"
"model_type": "qwen2_vl",
"format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
"format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
"format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
"format_observation": "<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n",
"format_separator": "\n",
"default_system": "You are a helpful assistant.",
"replace_tokens": [
"<image>",
"<|vision_start|><|image_pad|><|vision_end|>"
],
"extra_encode": "True",
"processor": "qwen2_vl"
}
```
The special token ```{{content}}``` is a placeholder that tells the preprocessor where to fill in the corresponding dialogue content.

```format_*```: adds role-specific tokens around the chat content, depending on the role name.

For example, the input conversation:<br>
```[{'role': 'user', 'value': '<image>\nWhat are the colors of the bus in the image?'}, {'role': 'assistant', 'value': 'The bus in the image is white and red.'}]```

Using the above template, the input will be converted to the format required by Qwen2-VL, as shown below: <br>
```'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>\nWhat are the colors of the bus in the image?<|im_end|>\n<|im_start|>assistant\nThe bus in the image is white and red.<|im_end|>\n<|im_start|>user\nWhat feature can be seen on the back of the bus?<|im_end|>\n<|im_start|>assistant\nThe back of the bus features an advertisement.<|im_end|>\n<|im_start|>user\nIs the bus driving down the street or pulled off to the side?<|im_end|>\n<|im_start|>assistant\nThe bus is driving down the street, which is crowded with people and other vehicles.<|im_end|>\n'```.
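
The sketch below illustrates how the ```format_*``` fields and ```replace_tokens``` could be applied to render a conversation. It is an illustration of the idea only, not auto_round's actual preprocessing code:

```python
# Illustration only: a toy renderer for the template fields shown above.
template = {
    "format_user": "<|im_start|>user\n{{content}}<|im_end|>\n",
    "format_assistant": "<|im_start|>assistant\n{{content}}<|im_end|>\n",
    "format_system": "<|im_start|>system\n{{content}}<|im_end|>\n",
    "default_system": "You are a helpful assistant.",
    "replace_tokens": ["<image>", "<|vision_start|><|image_pad|><|vision_end|>"],
}

def render(conversation):
    # Start with the system block, then wrap each turn with its role's format_* pattern.
    text = template["format_system"].replace("{{content}}", template["default_system"])
    for turn in conversation:
        content = turn["value"].replace(*template["replace_tokens"])
        text += template[f"format_{turn['role']}"].replace("{{content}}", content)
    return text

conversation = [
    {"role": "user", "value": "<image>\nWhat are the colors of the bus in the image?"},
    {"role": "assistant", "value": "The bus in the image is white and red."},
]
print(render(conversation))  # produces the opening turns of the string shown above
```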

### Processor
The Processor is a callback interface for invoking different processors, such as text or image processors, for MLLMs. Users can define their own processor and declare it with the registration function. For more information, please refer to the relevant code in ```auto_round/mllm/processor.py```.
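
A generic sketch of the registration pattern described here is shown below; the registry, decorator name, and method signature are assumptions for illustration, not auto_round's actual API (see ```auto_round/mllm/processor.py``` for the real interface):

```python
# Illustrative registry pattern only; names and signatures here are assumed,
# not taken from auto_round/mllm/processor.py.
PROCESSORS = {}

def register_processor(name):
    """Register a processor class under `name` so a template's "processor" field can refer to it."""
    def decorator(cls):
        PROCESSORS[name] = cls
        return cls
    return decorator

@register_processor("my_model")
class MyModelProcessor:
    def get_input(self, text, images=None, **kwargs):
        # Convert raw text and image inputs into model-ready features here.
        raise NotImplementedError
```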
44 changes: 18 additions & 26 deletions auto_round/mllm/autoround_mllm.py
@@ -30,12 +30,11 @@
from ..low_cpu_mem.utils import get_layers_before_block



def _only_text_test(model, tokenizer):
"""Test if the model whether can use text-only datasets."""
try:
text = ["only text", "test"]
tokenizer.padding_side = 'left'
text = ["only text", "test"]
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
@@ -97,7 +96,7 @@ def __init__(
self,
model,
tokenizer,
image_processor=None,
bits: int = 4,
group_size: int = 128,
sym: bool = False,
@@ -145,7 +144,7 @@ def __init__(
self.template = template if template is not None else model.config.model_type
self.template = get_template(
self.template, model=model, tokenizer=tokenizer, image_processor=image_processor)

dataset = self.template.default_dataset if dataset is None else dataset
from ..calib_dataset import CALIB_DATASETS
if truncation is None:
@@ -162,14 +161,13 @@
"will use liuhaotian/llava_conv_58k with default config as an alternative.")
else:
logger.warning(f"{model.config.model_type} not support for {dataset},"
" will use liuhaotian/llava_conv_58k with default config as an alternative.")
" will use liuhaotian/llava_conv_58k with default config as an alternative.")
dataset = "liuhaotian/llava_conv_58k"
self.truncation = False
batch_size = 1
gradient_accumulate_steps = 4
seqlen = 512


super(AutoRoundMLLM, self).__init__(
model=model,
tokenizer=tokenizer,
@@ -209,10 +207,6 @@ def __init__(
**kwargs,
)





def calib(self, nsamples, bs):
"""Perform calibration for quantization.
@@ -228,19 +222,19 @@
if isinstance(self.dataset, str):
dataset = self.dataset.replace(" ", "")
self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_mllm_dataloader(
template=self.template,
model=self.model,
tokenizer=self.tokenizer,
image_processor=self.image_processor,
dataset=dataset,
extra_data_dir=self.extra_data_dir,
seqlen=self.seqlen,
bs=self.batch_size,
seed=self.seed,
truncation=self.truncation,
nsamples=self.nsamples,
gradient_accumulate_steps=self.gradient_accumulate_steps,
quant_nontext_module=self.quant_nontext_module
)
else:
self.dataloader = self.dataset
@@ -338,10 +332,8 @@ def calib(self, nsamples, bs):
if isinstance(v[key], list) and len(v[key]) == total_cnt:
self.inputs[k][key] = v[key][:max_len]


# clean embed weight to save memory
if self.low_cpu_mem_usage:
for n, m in embed_layers:
m = m.to("meta")
# torch.cuda.empty_cache()
