From 5d2be87e0fe842d682fab4f84c33c47c4f389afa Mon Sep 17 00:00:00 2001
From: Nikita Savelyev
Date: Wed, 6 Nov 2024 15:15:30 +0100
Subject: [PATCH] Make SE algorithm also return zero points (#3054)

### Changes

1. Made Scale Estimation return zero points in addition to scales.
2. Minor fixes to the `do_int_quantization` function; made its signature stricter.

### Reason for changes

1. Currently, zero points have to be re-computed for the weights for which SE has computed scales.
2. Avoid unnecessary computations inside `do_int_quantization` and prevent incorrect usage of the function.

### Tests

Added a test to verify the correctness of feeding precomputed quantization parameters to `do_int_quantization`.

### Tickets

Prerequisite to 139047.
---
 .../weight_compression/algorithm.py           |  2 +-
 .../weight_compression/scale_estimation.py    | 12 +++--
 .../weight_compression/weight_lowering.py     |  9 +++-
 .../quantization/test_weights_compression.py  | 47 +++++++++++++++++++
 4 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py
index 81bb4406f0a..957b1a55a42 100644
--- a/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -617,7 +617,7 @@ def apply(
         else:
             if self._scale_estimation:
                 scale_estimation_params = self._advanced_parameters.scale_estimation_params
-                scales = ScaleEstimation(
+                scales, zero_points = ScaleEstimation(
                     model,
                     self._backend_entity.name_to_node_mapping,
                     all_weight_params,
diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
index aaa46a4e7c6..a5572530857 100644
--- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py
+++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py
@@ -112,7 +112,7 @@ def apply(
         graph: NNCFGraph,
         statistic_points: Optional[StatisticPointsContainer] = None,
         dataset: Optional[Dataset] = None,
-    ) -> Dict[str, Tensor]:
+    ) -> Tuple[Dict[str, Tensor], Dict[str, Tensor]]:
         """
         Estimates better scale for the int4 nodes in the model.
         Minimizes per-group difference between floating point MatMul and
@@ -124,10 +124,10 @@ def apply(
         :param graph: Model graph.
         :param statistic_points: Statistic points with collected statistics values.
         :param dataset: A representative dataset for the calibration process.
-        :return: Dict with pairs (weight name, estimated scale).
+        :return: Two dictionaries with estimated scales and zero points for each weight name.
""" - scales = dict() + scales, zero_points = dict(), dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): weight_name = wp.weight_name @@ -147,7 +147,7 @@ def apply( weight = self._backend_entity.get_weight(wp.node_with_weight, weight_port_id, model, graph) - scales[weight_name], _ = self.calculate_quantization_params( + scales[weight_name], zero_points[weight_name] = self.calculate_quantization_params( self._backend_entity, stats, weight, @@ -159,7 +159,7 @@ def apply( self._weight_penalty, ) - return scales + return scales, zero_points @staticmethod def calculate_quantization_params( @@ -369,6 +369,8 @@ def calculate_quantization_params( if config.group_size == -1: result_scale = fns.squeeze(result_scale, axis=1) + if zp is not None and config.group_size == -1: + zp = fns.squeeze(zp, axis=1) return result_scale, zp diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index 342725c0237..13406c0b288 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -358,6 +358,12 @@ def do_int_quantization( """ assert config.is_integer(), "The function supports integer quantization only" group_size = config.group_size + is_asym = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + if is_asym and (precomputed_scale is None) != (precomputed_zero_point is None): + raise ValueError( + "If precomputed quantization parameters are provided, both scale and zero point are required " + "for asymmetric quantization." + ) if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -366,7 +372,8 @@ def do_int_quantization( # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2] weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size) - if precomputed_zero_point is None or precomputed_zero_point is None: + scale, zero_point = None, None + if precomputed_scale is None or (is_asym and precomputed_zero_point is None): scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config) if precomputed_scale is not None: scale = precomputed_scale diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index db72b267698..347c299a50a 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -1074,6 +1074,53 @@ def test_compressed_weighs_range(mode, data): assert np.allclose(np.abs(compressed_weighs.data), np.abs(w.data)) +@pytest.mark.parametrize( + ("config", "precompute_scale", "precompute_zero_point", "raises"), + [ + (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), False, False, False), + (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), True, True, False), + (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), True, False, True), + (WeightCompressionConfig(CompressWeightsMode.INT8_ASYM), False, True, True), + (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), False, False, False), + (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), True, True, False), + (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), True, False, True), + (WeightCompressionConfig(CompressWeightsMode.INT4_ASYM), False, True, True), + 
+        (WeightCompressionConfig(CompressWeightsMode.INT8_SYM), True, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT8_SYM), False, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_SYM), True, False, False),
+        (WeightCompressionConfig(CompressWeightsMode.INT4_SYM), False, False, False),
+    ],
+)
+def test_int_quantization_with_precomputed_parameters(config, precompute_scale, precompute_zero_point, raises):
+    is_asym = config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT8_ASYM]
+
+    precomputed_scale, precomputed_zero_point = None, None
+    weight = Tensor(((np.arange(11) - 5) / 10).astype(np.float32)[:, None])
+    if precompute_scale:
+        precomputed_scale = Tensor(-((np.arange(11) - 5) / 100).astype(np.float32)[:, None])
+    if precompute_zero_point:
+        precomputed_zero_point = Tensor(np.arange(11).astype(np.int32)[:, None])
+
+    if raises:
+        with pytest.raises(ValueError) as exc_info:
+            _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point)
+        assert str(exc_info.value) == (
+            "If precomputed quantization parameters are provided, both scale and zero point "
+            "are required for asymmetric quantization."
+        )
+        return
+    else:
+        _, scale, zero_point = do_int_quantization(weight, -1, config, precomputed_scale, precomputed_zero_point)
+
+    if precompute_scale:
+        assert np.allclose(scale.data, precomputed_scale.data)
+    if is_asym:
+        if precompute_zero_point:
+            assert np.allclose(zero_point.data, precomputed_zero_point.data)
+        else:
+            assert zero_point is None
+
+
 @pytest.mark.parametrize("mode", INT4_NF4_MODES)
 def test_call_max_var_criterion_with_dataset_gptq_neg_group_size(mode):
     model = AWQMatmulModel().ov_model
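For reference, below is a minimal sketch (not part of the patch) of how the stricter `do_int_quantization` signature behaves after this change, based on the call pattern used in the new test. The import paths are assumptions inferred from the file paths in the diff and may differ from the actual package layout.

```python
# Minimal sketch of the precomputed-parameters path of do_int_quantization after this change.
# Import paths below are assumptions based on the repository layout referenced in the diff.
import numpy as np

from nncf import CompressWeightsMode
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_int_quantization
from nncf.tensor import Tensor

config = WeightCompressionConfig(CompressWeightsMode.INT8_ASYM)
weight = Tensor(((np.arange(11) - 5) / 10).astype(np.float32)[:, None])
scale = Tensor(np.full((11, 1), 0.05, dtype=np.float32))
zero_point = Tensor(np.zeros((11, 1), dtype=np.int32))

# When both parameters are precomputed, they are reused instead of being recalculated.
compressed_weight, out_scale, out_zero_point = do_int_quantization(weight, -1, config, scale, zero_point)
assert np.allclose(out_scale.data, scale.data)

# For asymmetric modes, providing only one of the two parameters now raises a ValueError.
try:
    do_int_quantization(weight, -1, config, scale, None)
except ValueError as err:
    print(err)
```

This mirrors the new validation added in `do_int_quantization`: for asymmetric modes, the precomputed scale and zero point must be provided together or not at all.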