From 9fb19c164e91a5bbf2dcb7884e530b0daecb5ed2 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Fri, 8 Nov 2024 14:04:11 -0800
Subject: [PATCH] [Quant Tool] Add reduce_range option to get_qdq_config()
 (#22782)

### Description
Adds a `reduce_range` option to `get_qdq_config()`.


### Motivation and Context
Makes it easier to set this option when calling `get_qdq_config()`.
Otherwise, the user has to set the option manually.
---
 onnxruntime/python/tools/quantization/quantize.py           | 6 +++++-
 onnxruntime/test/python/quantization/test_get_qdq_config.py | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index f368f35955955..4ffd8b9872982 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -231,6 +231,7 @@ def get_qdq_config(
     activation_symmetric: bool = False,
     weight_symmetric: bool | None = None,
     per_channel: bool = False,
+    reduce_range: bool = False,
     keep_removable_activations: bool = False,
     min_real_range: float | None = None,
     tensor_quant_overrides: dict[str, list[dict[str, Any]]] | None = None,
@@ -245,7 +246,7 @@ def get_qdq_config(
         calibration_data_reader: Calibration data reader.
         calibrate_methode: The calibration method. Defaults to MinMax.
         activation_type: The default activation quantization type. Defaults to QUInt8.
-        weight_type: The default weight quantization type. Defaults to QUInt8.
+        weight_type: The default weight quantization type. Defaults to QInt8.
         activation_symmetric: True if activations should be quantized symmetrically (i.e, rmax == -rmin) by default.
             Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
             the zero-point values are 127 and 32,767, respectively.
@@ -254,6 +255,8 @@ def get_qdq_config(
         per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
             Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators
             and their quantization axes.
+        reduce_range: Quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false.
+            May improve the accuracy for some models running on non-VNNI machines, especially for per-channel mode.
         keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
                         be removed, and will be explicitly represented in the QDQ model. If false, these activations
                         are automatically removed if activations are asymmetrically quantized. Keeping these activations
@@ -373,6 +376,7 @@ def get_qdq_config(
         op_types_to_quantize=list(op_types.difference(op_types_to_exclude)),
         nodes_to_exclude=final_nodes_to_exclude,
         per_channel=per_channel,
+        reduce_range=reduce_range,
         use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD),
         extra_options=final_extra_options,
     )
diff --git a/onnxruntime/test/python/quantization/test_get_qdq_config.py b/onnxruntime/test/python/quantization/test_get_qdq_config.py
index d7055764f745a..58d00272475cd 100644
--- a/onnxruntime/test/python/quantization/test_get_qdq_config.py
+++ b/onnxruntime/test/python/quantization/test_get_qdq_config.py
@@ -93,6 +93,7 @@ def test_basic_args(self):
             activation_type=QuantType.QUInt16,
             weight_type=QuantType.QInt16,
             per_channel=True,
+            reduce_range=True,
             nodes_to_exclude=["Mul"],
             # Other options converted to extra_options:
             min_real_range=0.0001,
@@ -104,6 +105,7 @@ def test_basic_args(self):
         self.assertEqual(qdq_config.activation_type, QuantType.QUInt16)
         self.assertEqual(qdq_config.weight_type, QuantType.QInt16)
         self.assertTrue(qdq_config.per_channel)
+        self.assertTrue(qdq_config.reduce_range)
         self.assertEqual(set(qdq_config.nodes_to_exclude), {"Mul"})
         self.assertEqual(set(qdq_config.op_types_to_quantize), {"Add"})