microsoft · apsonawane · Nov 7, 2024 · Nov 7, 2024 · Nov 7, 2024 · Nov 7, 2024
diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
@@ -437,7 +437,7 @@ def save_model(self, out_dir):
         # Quantize ONNX model to desired precision
         # TODO: Replace by quantizing the MatMuls as they are created
         already_quantized_in_qdq_format = self.quant_type is not None and self.quant_attrs["use_qdq"]  # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path
-        if self.onnx_dtype == "int4" and not already_quantized_in_qdq_format:
+        if self.onnx_dtype == "int4" and not already_quantized_in_qdq_format and not self.matmul_attrs["use_lora"]:  # Also skip the int4 pass when `use_lora` is set
             model = self.to_int4(model)
 
         # Save ONNX model with only one external data file and delete any existing duplicate copies
@@ -714,7 +714,7 @@ def make_tanh(self, name, root_input, dtype, shape):
         self.make_value_info(output, dtype, shape=shape)
 
     def make_matmul(self, matmul, basename, root_input, **kwargs):
-        if hasattr(matmul, "base_layer"):
+        if hasattr(matmul, "lora_A"):  # LoRA layers are detected by their `lora_A` attribute
             # For LoRA `MatMul`
             return self.make_matmul_lora(matmul, basename, root_input, **kwargs)
         else:
@@ -853,14 +853,26 @@ def make_matmul_lora(self, matmul, basename, root_input, **kwargs):
         matmul_A_name = self.make_matmul_op(matmul.lora_A.default, matmul_A_basename, root_input=root_input)
         lora_A = f"{matmul_A_name}/output_0"
 
-        matmul.lora_B.default.weight *= matmul.scaling["default"]
+        from peft import PeftConfig
+        peft_config = PeftConfig.from_pretrained(self.extra_options["adapter_path"], trust_remote_code=True)
+        # Fold the LoRA scaling factor (lora_alpha / r) from the adapter config into the lora_B weights
+        matmul.lora_B.default.weight *= peft_config.lora_alpha / peft_config.r
         matmul_B_basename = "/".join(basename_parts[:-1] + ["lora_B"] + basename_parts[-1:])
         matmul_B_name = self.make_matmul_op(matmul.lora_B.default, matmul_B_basename, root_input=lora_A)
         lora_B = f"{matmul_B_name}/output_0"
 
-        # Make regular MatMul path
-        last_dim = matmul.base_layer.weight.shape[0]
-        matmul_name = self.make_matmul_op(matmul.base_layer, basename, root_input, **kwargs)
+        if hasattr(matmul, "base_layer"):
+            # Make MatMul from the wrapped `base_layer`
+            last_dim = matmul.base_layer.weight.shape[0]
+            matmul_name = self.make_matmul_op(matmul.base_layer, basename, root_input, **kwargs)
+        elif hasattr(matmul, "qweight"):
+            # Make quantized MatMul path (layer exposes `qweight` instead of `weight`)
+            last_dim = matmul.qweight.shape[0]
+            matmul_name = self.make_matmul_op(matmul, basename, root_input, **kwargs)
+        else:
+            # Make regular MatMul path
+            last_dim = matmul.weight.shape[0]
+            matmul_name = self.make_matmul_op(matmul, basename, root_input, **kwargs)
 
         # Make LoRA Add node
         add_name = "/".join(basename_parts[:-1] + ["lora", "Add"])
@@ -2036,13 +2048,14 @@ def make_model(self, input_path):
                 kv_size,
                 self.intermediate_size,
                 self.num_layers,
+                self.extra_options["adapter_path"] if "adapter_path" in self.extra_options else None,
             )
         else:
             # Load PyTorch model
             extra_kwargs = {"num_hidden_layers": self.num_layers} if "num_hidden_layers" in self.extra_options else {}
             model = AutoModelForCausalLM.from_pretrained(self.model_name_or_path, cache_dir=self.cache_dir, token=self.hf_token, trust_remote_code=True, **extra_kwargs)
 
-        if "adapter_path" in self.extra_options:
+        if "adapter_path" in self.extra_options and self.quant_type is None:
             from peft import PeftModel
             model = PeftModel.from_pretrained(model, self.extra_options["adapter_path"], cache_dir=self.cache_dir, token=self.hf_token)