# Make SE algorithm also return zero points (#3054)
### Changes

1. Made Scale Estimation return zero points in addition to scales (see the sketch after this list).
2. Minor fixes to the `do_int_quantization` function, making its signature stricter.
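
A minimal sketch of the resulting calling pattern (illustrative only: `all_weight_params`, `weights`, and `wp.compression_config` are assumed names, while the `apply` and `do_int_quantization` signatures follow the diffs below):

```python
# Scale Estimation now returns zero points alongside scales, so both can be
# fed back into do_int_quantization instead of recomputing the zero points.
scales, zero_points = scale_estimation.apply(model, graph, statistic_points, dataset)

for wp in all_weight_params:  # hypothetical iteration over weight parameters
    name = wp.weight_name
    # do_int_quantization returns (compressed_weight, scale, zero_point); for
    # asymmetric modes both precomputed parameters must be provided together.
    compressed_weight, scale, zero_point = do_int_quantization(
        weights[name],          # hypothetical lookup of the weight tensor
        -1,                     # reduction axis, as in the added test below
        wp.compression_config,  # assumed attribute holding the config
        precomputed_scale=scales[name],
        precomputed_zero_point=zero_points[name],
    )
```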

### Reason for changes

1. Currently, zero points need to be re-computed for the weights for which SE has already computed scales.
2. Avoid unnecessary computations inside `do_int_quantization` and prevent incorrect usage of the function.

### Tests

Added a test verifying that feeding precomputed quantization parameters to `do_int_quantization` works correctly.

### Tickets
Prerequisite to 139047.
nikita-savelyevv authored Nov 6, 2024
1 parent f5ef50a commit 5d2be87
Showing 4 changed files with 63 additions and 7 deletions.
```diff
@@ -617,7 +617,7 @@ def apply(
         else:
             if self._scale_estimation:
                 scale_estimation_params = self._advanced_parameters.scale_estimation_params
-                scales = ScaleEstimation(
+                scales, zero_points = ScaleEstimation(
                     model,
                     self._backend_entity.name_to_node_mapping,
                     all_weight_params,
```
```diff
@@ -112,7 +112,7 @@ def apply(
         graph: NNCFGraph,
         statistic_points: Optional[StatisticPointsContainer] = None,
         dataset: Optional[Dataset] = None,
-    ) -> Dict[str, Tensor]:
+    ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]:
         """
         Estimates better scale for the int4 nodes in the model.
         Minimizes per-group difference between floating point MatMul and
@@ -124,10 +124,10 @@ def apply(
         :param graph: Model graph.
         :param statistic_points: Statistic points with collected statistics values.
         :param dataset: A representative dataset for the calibration process.
-        :return: Dict with pairs (weight name, estimated scale).
+        :return: Two dictionaries for estimated scales and zero points for each weight name.
         """

-        scales = dict()
+        scales, zero_points = dict(), dict()

         for wp in track(self._all_weight_params, description="Applying Scale Estimation"):
             weight_name = wp.weight_name
@@ -147,7 +147,7 @@ def apply(

             weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph)

-            scales[weight_name], _ = self.calculate_quantization_params(
+            scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params(
                 self._backend_entity,
                 stats,
                 weight,
@@ -159,7 +159,7 @@ def apply(
                 self._weight_penalty,
             )

-        return scales
+        return scales, zero_points

     @staticmethod
     def calculate_quantization_params(
@@ -369,6 +369,8 @@ def calculate_quantization_params(

         if config.group_size == -1:
             result_scale = fns.squeeze(result_scale, axis=1)
+        if zp is not None and config.group_size == -1:
+            zp = fns.squeeze(zp, axis=1)

         return result_scale, zp
```
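
The added guard mirrors the existing squeeze of `result_scale`: for symmetric modes `zp` is `None` and must be left alone. A small illustration using NumPy in place of NNCF's `fns` wrapper (the shapes are hypothetical):

```python
import numpy as np

# Group-wise parameters carry a group axis at position 1; with group_size == -1
# there is exactly one group, so that axis is squeezed away.
result_scale = np.ones((128, 1, 1), dtype=np.float32)
zp = np.zeros((128, 1, 1), dtype=np.int32)  # would be None for symmetric modes

result_scale = np.squeeze(result_scale, axis=1)  # shape (128, 1)
if zp is not None:
    zp = np.squeeze(zp, axis=1)                  # shape (128, 1)
```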
```diff
@@ -358,6 +358,12 @@ def do_int_quantization(
     """
     assert config.is_integer(), "The function supports integer quantization only"
     group_size = config.group_size
+    is_asym = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]
+    if is_asym and (precomputed_scale is None) != (precomputed_zero_point is None):
+        raise ValueError(
+            "If precomputed quantization parameters are provided, both scale and zero point are required "
+            "for asymmetric quantization."
+        )

     if weight.dtype != TensorDataType.float32:
         weight = weight.astype(TensorDataType.float32)
@@ -366,7 +372,8 @@ def do_int_quantization(
         # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
         weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)

-    if precomputed_zero_point is None or precomputed_zero_point is None:
+    scale, zero_point = None, None
+    if precomputed_scale is None or (is_asym and precomputed_zero_point is None):
         scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config)
     if precomputed_scale is not None:
         scale = precomputed_scale
```
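
For context, in asymmetric quantization the scale and zero point are derived jointly from the same min/max statistics, which is why the new check rejects providing only one of them. A textbook reference computation (a sketch under standard min-max assumptions, not necessarily what NNCF's `calculate_integer_quantization_params` does):

```python
import numpy as np

def reference_asym_params(weight: np.ndarray, axis: int, num_bits: int = 4):
    """Textbook asymmetric INT quantization parameters; scale and zero point
    are coupled through the shared per-group min/max."""
    level_low, level_high = 0, 2**num_bits - 1
    w_min = np.minimum(weight.min(axis=axis, keepdims=True), 0.0)
    w_max = np.maximum(weight.max(axis=axis, keepdims=True), 0.0)
    scale = (w_max - w_min) / (level_high - level_low)
    scale = np.where(scale == 0.0, 1.0, scale)  # guard against constant groups
    zero_point = np.clip(np.round(-w_min / scale), level_low, level_high)
    return scale, zero_point.astype(np.int32)
```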
`tests/openvino/native/quantization/test_weights_compression.py` (47 additions, 0 deletions)
```diff
@@ -1074,6 +1074,53 @@ def test_compressed_weighs_range(mode, data):
     assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data))


+@pytest.mark.parametrize(
+    ("config", "precompute_scale", "precompute_zero_point", "raises"),
+    [
+        (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), False, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), True, True, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), True, False, True),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), False, True, True),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), False, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), True, True, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), True, False, True),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), False, True, True),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_SYM), True, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_SYM), False, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_SYM), True, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_SYM), False, False, False),
+    ],
+)
+def test_int_quantization_with_precomputed_parameters(config, precompute_scale, precompute_zero_point, raises):
+    is_asym = config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM]
+
+    precomputed_scale, precomputed_zero_point = None, None
+    weight = Tensor(((np.arange(11) - 5) / 10).astype(np.float32)[:, None])
+    if precompute_scale:
+        precomputed_scale = Tensor(-((np.arange(11) - 5) / 100).astype(np.float32)[:, None])
+    if precompute_zero_point:
+        precomputed_zero_point = Tensor(np.arange(11).astype(np.int32)[:, None])
+
+    if raises:
+        with pytest.raises(ValueError) as exc_info:
+            _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point)
+        assert str(exc_info.value) == (
+            "If precomputed quantization parameters are provided, both scale and zero point "
+            "are required for asymmetric quantization."
+        )
+        return
+    else:
+        _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point)
+
+    if precompute_scale:
+        assert np.allclose(scale.data, precomputed_scale.data)
+    if is_asym:
+        if precompute_zero_point:
+            assert np.allclose(zero_point.data, precomputed_zero_point.data)
+    else:
+        assert zero_point is None
+
+
 @pytest.mark.parametrize("mode", INT4_NF4_MODES)
 def test_call_max_var_criterion_with_dataset_gptq_neg_group_size(mode):
     model = AWQMatmulModel().ov_model
```
