From 48682f1aa03c69cabc95ceb9892fd3496902def0 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 29 Feb 2024 15:33:04 +0100 Subject: [PATCH 01/13] Align compression subgraphs for both weight input data types --- .../weight_compression/openvino_backend.py | 20 +++++---- .../quantization/test_weights_compression.py | 41 +++++++++++-------- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 87793477dd3..394d6c504d0 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -144,7 +144,7 @@ def transform_model( const_attributes = wc_params.node_with_weight.layer_attributes.constant_attributes[wc_params.weight_port_id] const_node_name = const_attributes["name"] const_node = self.name_to_node_mapping[const_node_name] - const_dtype = const_node.output(0).get_element_type().to_dtype() + const_dtype = const_node.output(0).get_element_type() weight = Tensor(get_const_value(const_node)) original_shape = weight.shape @@ -153,24 +153,30 @@ def transform_model( compressed_const = opset.constant( compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name ) - converted_const = opset.convert(compressed_const, const_dtype) + converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: zero_point_const = opset.constant( compressed_weight.zero_point.data, dtype=compression_dtype, name=f"{const_node_name}/zero_point", ) - converted_zero_point = opset.convert(zero_point_const, const_dtype) - converted_const = opset.subtract(converted_const, converted_zero_point) + converted_zero_point = opset.convert(zero_point_const, ov.Type.f16) + converted_const = opset.subtract( + converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract" + ) - scale_const = opset.constant(compressed_weight.scale.data, dtype="float16", name=f"{const_node_name}/scale") - if const_dtype != "float16": - scale_const = opset.convert(scale_const, const_dtype, name=f"{const_node_name}/scale_convert") + scale_const = opset.constant( + compressed_weight.scale.data, dtype=ov.Type.f16, name=f"{const_node_name}/scale" + ) mul = opset.multiply( converted_const, scale_const, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) + if const_dtype != ov.Type.f16: + mul = opset.convert( + mul, ov.Type.f32, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}/convert" + ) if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 71556ce3ac6..0f6a441fb40 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -138,7 +138,10 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = scale_node = scale_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape - reshape_node = get_next_node(mul_node) + convert_node = get_next_node(mul_node) + assert convert_node.get_type_name() == "Convert" + + reshape_node = get_next_node(convert_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -164,7 +167,10 @@ def check_nf4_grouped(op: ov.Node, 
group_size: int = 7): scale_node = scale_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape - reshape_node = get_next_node(mul_node) + convert_node = get_next_node(mul_node) + assert convert_node.get_type_name() == "Convert" + + reshape_node = get_next_node(convert_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -697,22 +703,21 @@ def test_data_type_for_num_weights(mocker): assert isinstance(params.num_weights, np.uint64) -def test_weight_scale_datatype(): - # When model weight is in fp32, there will be an extra convert node for weight scale f16 > f32 - model_fp32 = IdentityMatmul(weights_dtype=np.float32).ov_model - compressed_model_fp32 = compress_weights(model_fp32) - name_to_node_map = {op.get_friendly_name(): op for op in compressed_model_fp32.get_ops()} - assert "weights/scale_convert" in name_to_node_map - scale_multiply_node = name_to_node_map["weights/fq_weights_1"] - assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f32 - - # When model weight is in fp16, there will be no extra convert node for weight scale - model_fp16 = IdentityMatmul(weights_dtype=np.float16).ov_model - compressed_model_fp16 = compress_weights(model_fp16) - name_to_node_map = {op.get_friendly_name(): op for op in compressed_model_fp16.get_ops()} - assert "weights/scale_convert" not in name_to_node_map - scale_multiply_node = name_to_node_map["weights/fq_weights_1"] - assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f16 +def test_compression_subgraph_for_different_weight_types(): + for weight_dtype in [np.float32, np.float16]: + model_fp32 = IdentityMatmul(weights_dtype=weight_dtype).ov_model + compressed_model_fp32 = compress_weights(model_fp32) + name_to_node_map = {op.get_friendly_name(): op for op in compressed_model_fp32.get_ops()} + + # Weight scale should be in fp16 nevertheless the weight data type + scale_multiply_node = name_to_node_map["weights/fq_weights_1"] + assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f16 + + convert_node = get_next_node(scale_multiply_node) + assert convert_node.get_type_name() == "Convert" + # In case weight is in fp32, the convert node is manually inserted + if weight_dtype == np.float32: + assert convert_node.get_friendly_name() == "weights/fq_weights_1/convert" DATASET_SIZE = 129 From 4a78bf914e96f8e41b20582a8513d02c26ec8c57 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 29 Feb 2024 15:41:15 +0100 Subject: [PATCH 02/13] Remove conditions added in the previous PR --- .../native/quantization/test_weights_compression.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 0f6a441fb40..e1a35ad54e1 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -96,8 +96,6 @@ def check_int8_node(op: ov.Node, mode: CompressWeightsMode = CompressWeightsMode mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" scale_node = mul_node.input_value(1).get_node() - if scale_node.get_type_name() == "Convert": - scale_node = scale_node.input_value(0).get_node() scale = get_const_value(scale_node) return { @@ -134,8 +132,6 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == 
"Multiply" scale_node = mul_node.input_value(1).get_node() - if scale_node.get_type_name() == "Convert": - scale_node = scale_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape convert_node = get_next_node(mul_node) @@ -163,8 +159,6 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): mul_node = get_next_node(convert_node) assert mul_node.get_type_name() == "Multiply" scale_node = mul_node.input_value(1).get_node() - if scale_node.get_type_name() == "Convert": - scale_node = scale_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape convert_node = get_next_node(mul_node) From 22d059b8f07631552d746e07f6449c59db6225ec Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 29 Feb 2024 17:08:33 +0100 Subject: [PATCH 03/13] Transition to FP32 case subgraph --- .../weight_compression/openvino_backend.py | 16 ++++---- .../quantization/test_weights_compression.py | 39 +++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 394d6c504d0..b9d2a793e56 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -153,14 +153,14 @@ def transform_model( compressed_const = opset.constant( compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name ) - converted_const = opset.convert(compressed_const, ov.Type.f16) + converted_const = opset.convert(compressed_const, ov.Type.f32) if compressed_weight.zero_point is not None: zero_point_const = opset.constant( compressed_weight.zero_point.data, dtype=compression_dtype, name=f"{const_node_name}/zero_point", ) - converted_zero_point = opset.convert(zero_point_const, ov.Type.f16) + converted_zero_point = opset.convert(zero_point_const, ov.Type.f32) converted_const = opset.subtract( converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract" ) @@ -168,21 +168,23 @@ def transform_model( scale_const = opset.constant( compressed_weight.scale.data, dtype=ov.Type.f16, name=f"{const_node_name}/scale" ) + scale_const = opset.convert(scale_const, ov.Type.f32, name=f"{const_node_name}/scale_convert") mul = opset.multiply( converted_const, scale_const, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) - if const_dtype != ov.Type.f16: - mul = opset.convert( - mul, ov.Type.f32, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}/convert" - ) if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) + const_node_output = const_node.output(0) + if const_dtype == ov.Type.f16: + # Bypass fp16 -> fp32 convert node + const_node_output = next(iter(const_node_output.get_target_inputs())).get_node().output(0) + mul_output = mul.output(0) - for target_input in const_node.output(0).get_target_inputs(): + for target_input in const_node_output.get_target_inputs(): target_input.replace_source_output(mul_output) # reset name_to_node_mapping diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index e1a35ad54e1..79fbc0ffba8 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -95,7 +95,8 @@ def check_int8_node(op: ov.Node, mode: CompressWeightsMode = 
CompressWeightsMode mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" - scale_node = mul_node.input_value(1).get_node() + convert_node = mul_node.input_value(1).get_node() + scale_node = convert_node.input_value(0).get_node() scale = get_const_value(scale_node) return { @@ -131,13 +132,11 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" - scale_node = mul_node.input_value(1).get_node() + convert_node = mul_node.input_value(1).get_node() + scale_node = convert_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape - convert_node = get_next_node(mul_node) - assert convert_node.get_type_name() == "Convert" - - reshape_node = get_next_node(convert_node) + reshape_node = get_next_node(mul_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -158,13 +157,11 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): mul_node = get_next_node(convert_node) assert mul_node.get_type_name() == "Multiply" - scale_node = mul_node.input_value(1).get_node() + convert_node = mul_node.input_value(1).get_node() + scale_node = convert_node.input_value(0).get_node() assert list(scale_node.shape) == reduced_weight_shape - convert_node = get_next_node(mul_node) - assert convert_node.get_type_name() == "Convert" - - reshape_node = get_next_node(convert_node) + reshape_node = get_next_node(mul_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -697,21 +694,23 @@ def test_data_type_for_num_weights(mocker): assert isinstance(params.num_weights, np.uint64) -def test_compression_subgraph_for_different_weight_types(): +def test_weight_scale_datatype(): for weight_dtype in [np.float32, np.float16]: model_fp32 = IdentityMatmul(weights_dtype=weight_dtype).ov_model compressed_model_fp32 = compress_weights(model_fp32) name_to_node_map = {op.get_friendly_name(): op for op in compressed_model_fp32.get_ops()} - # Weight scale should be in fp16 nevertheless the weight data type + # Scale should always be converted from f16 to f32 + assert "weights/scale_convert" in name_to_node_map scale_multiply_node = name_to_node_map["weights/fq_weights_1"] - assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f16 - - convert_node = get_next_node(scale_multiply_node) - assert convert_node.get_type_name() == "Convert" - # In case weight is in fp32, the convert node is manually inserted - if weight_dtype == np.float32: - assert convert_node.get_friendly_name() == "weights/fq_weights_1/convert" + convert_node = scale_multiply_node.input_value(1).get_node() + scale_node = convert_node.input_value(0).get_node() + assert scale_node.get_element_type() == ov.Type.f16 + assert convert_node.get_element_type() == ov.Type.f32 + + # There should be no Convert node after scale multiply + matmul_node = get_next_node(scale_multiply_node) + assert matmul_node.get_type_name() == 'MatMul' DATASET_SIZE = 129 From 22f59fc80aa4a48bba6613e19de715a56f78e84a Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 29 Feb 2024 17:11:13 +0100 Subject: [PATCH 04/13] Black --- tests/openvino/native/quantization/test_weights_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 79fbc0ffba8..32b8760f822 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py 
+++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -710,7 +710,7 @@ def test_weight_scale_datatype(): # There should be no Convert node after scale multiply matmul_node = get_next_node(scale_multiply_node) - assert matmul_node.get_type_name() == 'MatMul' + assert matmul_node.get_type_name() == "MatMul" DATASET_SIZE = 129 From 16a7fe0b521874c98dd65cb0384d7dbb11c85c12 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 29 Feb 2024 17:17:16 +0100 Subject: [PATCH 05/13] Tweak comment --- .../algorithms/weight_compression/openvino_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index b9d2a793e56..84978041459 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -180,7 +180,7 @@ def transform_model( const_node_output = const_node.output(0) if const_dtype == ov.Type.f16: - # Bypass fp16 -> fp32 convert node + # Bypass fp16 -> fp32 weight convert node const_node_output = next(iter(const_node_output.get_target_inputs())).get_node().output(0) mul_output = mul.output(0) From b4c78e9516e89730ffa5b70242d1b17bfdc5679b Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 1 Mar 2024 10:56:32 +0100 Subject: [PATCH 06/13] Rename test --- .../native/quantization/test_weights_compression.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 32b8760f822..70500669f6e 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -694,11 +694,11 @@ def test_data_type_for_num_weights(mocker): assert isinstance(params.num_weights, np.uint64) -def test_weight_scale_datatype(): +def test_compression_for_different_weight_dtypes(): for weight_dtype in [np.float32, np.float16]: - model_fp32 = IdentityMatmul(weights_dtype=weight_dtype).ov_model - compressed_model_fp32 = compress_weights(model_fp32) - name_to_node_map = {op.get_friendly_name(): op for op in compressed_model_fp32.get_ops()} + model = IdentityMatmul(weights_dtype=weight_dtype).ov_model + compressed_model = compress_weights(model) + name_to_node_map = {op.get_friendly_name(): op for op in compressed_model.get_ops()} # Scale should always be converted from f16 to f32 assert "weights/scale_convert" in name_to_node_map From 1f1d8822ad37bbb8b89caa0cce55bac8bdba7eca Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 19 Mar 2024 14:35:47 +0100 Subject: [PATCH 07/13] Consider the case of f16 activations --- .../weight_compression/openvino_backend.py | 24 ++++++++--- tests/openvino/native/models.py | 11 +++-- .../quantization/test_weights_compression.py | 43 +++++++++++-------- 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 84978041459..c2481504439 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -178,14 +178,24 @@ def transform_model( if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) - 
const_node_output = const_node.output(0) - if const_dtype == ov.Type.f16: - # Bypass fp16 -> fp32 weight convert node - const_node_output = next(iter(const_node_output.get_target_inputs())).get_node().output(0) - mul_output = mul.output(0) - for target_input in const_node_output.get_target_inputs(): - target_input.replace_source_output(mul_output) + for target_input in const_node.output(0).get_target_inputs(): + target_input_node = target_input.get_node() + if const_dtype == ov.Type.f16: + target_input_node_attrs = target_input_node.get_attributes() + if (target_input_node.get_type_name() == "Convert" and + target_input_node_attrs["destination_type"] == "f32"): + # Before compression, there was a f16 -> f32 Convert node after the weight. Now, scale multiply + # node is in f32, and this Convert node is not needed. + next_node_target_input = next(iter(target_input_node.output(0).get_target_inputs())) + next_node_target_input.replace_source_output(mul_output) + else: + # Both weight and activation are in f16. After the addition of f32 scale multiply node we have + # to add a Convert node. + mul_converted = opset.convert(mul, ov.Type.f16, name=f"{mul.get_friendly_name()}/convert") + target_input.replace_source_output(mul_converted.output(0)) + else: + target_input.replace_source_output(mul_output) # reset name_to_node_mapping self.name_to_node_mapping = None diff --git a/tests/openvino/native/models.py b/tests/openvino/native/models.py index c25df4d862f..2b83a95457a 100644 --- a/tests/openvino/native/models.py +++ b/tests/openvino/native/models.py @@ -790,16 +790,19 @@ def _create_ov_model(self): class IdentityMatmul(OVReferenceModel): - def _create_ov_model(self, weights_dtype=None): + def _create_ov_model(self, weights_dtype=None, activation_dtype=None): """ :param: weights_dtype: precision of weights, should be either np.float32 or np.float16 + :param: activation_dtype: precision of activations, should be either np.float32 or np.float16 """ weights_dtype = np.float32 if weights_dtype is None else weights_dtype - input_node = opset.parameter([3, 3], name="Input_1") + activation_dtype = np.float32 if activation_dtype is None else activation_dtype + + input_node = opset.parameter([3, 3], dtype=activation_dtype, name="Input_1") weights_data = np.eye(3) * 255 current_weights = opset.constant(weights_data, dtype=weights_dtype, name="weights") - if weights_dtype != np.float32: - current_weights = opset.convert(current_weights, np.float32, name="weights/convert") + if weights_dtype != activation_dtype: + current_weights = opset.convert(current_weights, activation_dtype, name="weights/convert") matmul_node = opset.matmul(input_node, current_weights, transpose_a=False, transpose_b=True, name="MatMul") result = opset.result(matmul_node, name="Result") result.get_output_tensor(0).set_names(set(["Result"])) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 70500669f6e..98767e29a49 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -694,23 +694,32 @@ def test_data_type_for_num_weights(mocker): assert isinstance(params.num_weights, np.uint64) -def test_compression_for_different_weight_dtypes(): - for weight_dtype in [np.float32, np.float16]: - model = IdentityMatmul(weights_dtype=weight_dtype).ov_model - compressed_model = compress_weights(model) - name_to_node_map = {op.get_friendly_name(): op for op in 
compressed_model.get_ops()} - - # Scale should always be converted from f16 to f32 - assert "weights/scale_convert" in name_to_node_map - scale_multiply_node = name_to_node_map["weights/fq_weights_1"] - convert_node = scale_multiply_node.input_value(1).get_node() - scale_node = convert_node.input_value(0).get_node() - assert scale_node.get_element_type() == ov.Type.f16 - assert convert_node.get_element_type() == ov.Type.f32 - - # There should be no Convert node after scale multiply - matmul_node = get_next_node(scale_multiply_node) - assert matmul_node.get_type_name() == "MatMul" +def test_compression_for_different_dtypes(): + for activation_dtype in [np.float32, np.float16]: + for weight_dtype in [np.float32, np.float16]: + if activation_dtype == np.float16 and weight_dtype == np.float32: + # Activations can be in f16 only if weights are in f16 + continue + + model = IdentityMatmul(weights_dtype=weight_dtype, activation_dtype=activation_dtype).ov_model + compressed_model = compress_weights(model) + name_to_node_map = {op.get_friendly_name(): op for op in compressed_model.get_ops()} + + # Scale should always be converted from f16 to f32 + assert "weights/scale_convert" in name_to_node_map + scale_multiply_node = name_to_node_map["weights/fq_weights_1"] + convert_node = scale_multiply_node.input_value(1).get_node() + scale_node = convert_node.input_value(0).get_node() + assert scale_node.get_element_type() == ov.Type.f16 + assert convert_node.get_element_type() == ov.Type.f32 + + node_after_scale = get_next_node(scale_multiply_node) + if activation_dtype == np.float16 and weight_dtype == np.float16: + # If both weights and activations are in f16, there should be a f32 -> f16 convert after scale multiply + assert node_after_scale.get_type_name() == "Convert" + else: + # Otherwise there should be no Convert node after scale multiply + assert node_after_scale.get_type_name() == "MatMul" DATASET_SIZE = 129 From 805f9924313355027e797b3ae27be989b2f1a5c5 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 19 Mar 2024 14:39:06 +0100 Subject: [PATCH 08/13] Black --- .../algorithms/weight_compression/openvino_backend.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c2481504439..b8060201224 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -183,8 +183,10 @@ def transform_model( target_input_node = target_input.get_node() if const_dtype == ov.Type.f16: target_input_node_attrs = target_input_node.get_attributes() - if (target_input_node.get_type_name() == "Convert" and - target_input_node_attrs["destination_type"] == "f32"): + if ( + target_input_node.get_type_name() == "Convert" + and target_input_node_attrs["destination_type"] == "f32" + ): # Before compression, there was a f16 -> f32 Convert node after the weight. Now, scale multiply # node is in f32, and this Convert node is not needed. 
next_node_target_input = next(iter(target_input_node.output(0).get_target_inputs())) From 633b4437ad89535c2ebba0ef942846f2dc842d36 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Tue, 19 Mar 2024 15:01:49 +0100 Subject: [PATCH 09/13] Tweak comment --- .../algorithms/weight_compression/openvino_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index b8060201224..964668694a7 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -192,7 +192,7 @@ def transform_model( next_node_target_input = next(iter(target_input_node.output(0).get_target_inputs())) next_node_target_input.replace_source_output(mul_output) else: - # Both weight and activation are in f16. After the addition of f32 scale multiply node we have + # Both weight and activation are in f16. Because f32 scale multiply node was added, we have # to add a Convert node. mul_converted = opset.convert(mul, ov.Type.f16, name=f"{mul.get_friendly_name()}/convert") target_input.replace_source_output(mul_converted.output(0)) From ba3f3b2d7ed7df228565ee97b10ef9be2b45f5ea Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 20 Mar 2024 10:39:49 +0100 Subject: [PATCH 10/13] Reverted back to FP16 case --- .../weight_compression/openvino_backend.py | 28 +++--------- .../quantization/test_weights_compression.py | 43 ++++++++++--------- 2 files changed, 29 insertions(+), 42 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 964668694a7..c7e63cfaf13 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -153,14 +153,14 @@ def transform_model( compressed_const = opset.constant( compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name ) - converted_const = opset.convert(compressed_const, ov.Type.f32) + converted_const = opset.convert(compressed_const, ov.Type.f16) if compressed_weight.zero_point is not None: zero_point_const = opset.constant( compressed_weight.zero_point.data, dtype=compression_dtype, name=f"{const_node_name}/zero_point", ) - converted_zero_point = opset.convert(zero_point_const, ov.Type.f32) + converted_zero_point = opset.convert(zero_point_const, ov.Type.f16) converted_const = opset.subtract( converted_const, converted_zero_point, name=f"{const_node_name}/zero_point/subtract" ) @@ -168,36 +168,22 @@ def transform_model( scale_const = opset.constant( compressed_weight.scale.data, dtype=ov.Type.f16, name=f"{const_node_name}/scale" ) - scale_const = opset.convert(scale_const, ov.Type.f32, name=f"{const_node_name}/scale_convert") mul = opset.multiply( converted_const, scale_const, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) + if const_dtype == ov.Type.f32: + mul = opset.convert( + mul, ov.Type.f32, name=f"{mul.get_friendly_name()}/convert" + ) if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) mul_output = mul.output(0) for target_input in const_node.output(0).get_target_inputs(): - target_input_node = target_input.get_node() - if const_dtype == ov.Type.f16: - target_input_node_attrs = target_input_node.get_attributes() - if ( - 
target_input_node.get_type_name() == "Convert" - and target_input_node_attrs["destination_type"] == "f32" - ): - # Before compression, there was a f16 -> f32 Convert node after the weight. Now, scale multiply - # node is in f32, and this Convert node is not needed. - next_node_target_input = next(iter(target_input_node.output(0).get_target_inputs())) - next_node_target_input.replace_source_output(mul_output) - else: - # Both weight and activation are in f16. Because f32 scale multiply node was added, we have - # to add a Convert node. - mul_converted = opset.convert(mul, ov.Type.f16, name=f"{mul.get_friendly_name()}/convert") - target_input.replace_source_output(mul_converted.output(0)) - else: - target_input.replace_source_output(mul_output) + target_input.replace_source_output(mul_output) # reset name_to_node_mapping self.name_to_node_mapping = None diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index 98767e29a49..b49cca1ed2c 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -95,8 +95,7 @@ def check_int8_node(op: ov.Node, mode: CompressWeightsMode = CompressWeightsMode mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" - convert_node = mul_node.input_value(1).get_node() - scale_node = convert_node.input_value(0).get_node() + scale_node = mul_node.input_value(1).get_node() scale = get_const_value(scale_node) return { @@ -132,11 +131,13 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" - convert_node = mul_node.input_value(1).get_node() - scale_node = convert_node.input_value(0).get_node() + scale_node = mul_node.input_value(1).get_node() assert list(scale_node.shape) == reduced_weight_shape - reshape_node = get_next_node(mul_node) + convert_node = get_next_node(mul_node) + assert convert_node.get_type_name() == "Convert" + + reshape_node = get_next_node(convert_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -157,11 +158,13 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): mul_node = get_next_node(convert_node) assert mul_node.get_type_name() == "Multiply" - convert_node = mul_node.input_value(1).get_node() - scale_node = convert_node.input_value(0).get_node() + scale_node = mul_node.input_value(1).get_node() assert list(scale_node.shape) == reduced_weight_shape - reshape_node = get_next_node(mul_node) + convert_node = get_next_node(mul_node) + assert convert_node.get_type_name() == "Convert" + + reshape_node = get_next_node(convert_node) assert reshape_node.get_type_name() == "Reshape" return { @@ -705,21 +708,19 @@ def test_compression_for_different_dtypes(): compressed_model = compress_weights(model) name_to_node_map = {op.get_friendly_name(): op for op in compressed_model.get_ops()} - # Scale should always be converted from f16 to f32 - assert "weights/scale_convert" in name_to_node_map + # Weight scale should be in fp16 nevertheless the weight data type scale_multiply_node = name_to_node_map["weights/fq_weights_1"] - convert_node = scale_multiply_node.input_value(1).get_node() - scale_node = convert_node.input_value(0).get_node() - assert scale_node.get_element_type() == ov.Type.f16 - assert convert_node.get_element_type() == ov.Type.f32 - - node_after_scale = get_next_node(scale_multiply_node) - if activation_dtype == 
np.float16 and weight_dtype == np.float16: - # If both weights and activations are in f16, there should be a f32 -> f16 convert after scale multiply - assert node_after_scale.get_type_name() == "Convert" + assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f16 + + next_node = get_next_node(scale_multiply_node) + if activation_dtype == np.float16: + # There should be no convert node after multiply if both weights and activations are in f16 + assert next_node.get_type_name() != "Convert" else: - # Otherwise there should be no Convert node after scale multiply - assert node_after_scale.get_type_name() == "MatMul" + assert next_node.get_type_name() == "Convert" + # In case weight is in fp32, the convert node is manually inserted + if weight_dtype == np.float32: + assert next_node.get_friendly_name() == "weights/fq_weights_1/convert" DATASET_SIZE = 129 From b162d73a1fdfa31d043aab52e4677fdbf6c95767 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 20 Mar 2024 10:51:21 +0100 Subject: [PATCH 11/13] Black --- .../algorithms/weight_compression/openvino_backend.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c7e63cfaf13..4c20521846c 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -174,9 +174,7 @@ def transform_model( name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) if const_dtype == ov.Type.f32: - mul = opset.convert( - mul, ov.Type.f32, name=f"{mul.get_friendly_name()}/convert" - ) + mul = opset.convert(mul, ov.Type.f32, name=f"{mul.get_friendly_name()}/convert") if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) From ec887be1136652feeb8af407c18cd577d237f506 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 22 Mar 2024 10:11:18 +0100 Subject: [PATCH 12/13] Address suggested changes --- .../algorithms/weight_compression/openvino_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 4c20521846c..9ba25b583fa 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -173,8 +173,8 @@ def transform_model( scale_const, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) - if const_dtype == ov.Type.f32: - mul = opset.convert(mul, ov.Type.f32, name=f"{mul.get_friendly_name()}/convert") + if const_dtype != ov.Type.f16: + mul = opset.convert(mul, const_dtype, name=f"{mul.get_friendly_name()}/convert") if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) From 7855ff6aa8811a151360ab1944611f52e1614203 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 4 Apr 2024 20:05:18 +0200 Subject: [PATCH 13/13] Move Convert after Reshape in case of grouped compression --- .../weight_compression/openvino_backend.py | 7 ++++-- .../quantization/test_weights_compression.py | 25 +++++++++++-------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py 
b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index 80b1aa1cc0f..5a5649a7d45 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -171,12 +171,15 @@ def transform_model( scale_const, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}", ) - if const_dtype != ov.Type.f16: - mul = opset.convert(mul, const_dtype, name=f"{mul.get_friendly_name()}/convert") if compression_config.group_size != -1: mul = opset.reshape(mul, output_shape=original_shape, special_zero=False) + if const_dtype != ov.Type.f16: + mul = opset.convert( + mul, const_dtype, name=f"{const_node_name}/fq_weights_{wc_params.weight_port_id}/convert" + ) + mul_output = mul.output(0) for target_input in const_node.output(0).get_target_inputs(): target_input.replace_source_output(mul_output) diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index b49cca1ed2c..f339d316d0d 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -134,12 +134,12 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = scale_node = mul_node.input_value(1).get_node() assert list(scale_node.shape) == reduced_weight_shape - convert_node = get_next_node(mul_node) - assert convert_node.get_type_name() == "Convert" - - reshape_node = get_next_node(convert_node) + reshape_node = get_next_node(mul_node) assert reshape_node.get_type_name() == "Reshape" + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + return { "scale": get_const_value(scale_node), } @@ -161,12 +161,12 @@ def check_nf4_grouped(op: ov.Node, group_size: int = 7): scale_node = mul_node.input_value(1).get_node() assert list(scale_node.shape) == reduced_weight_shape - convert_node = get_next_node(mul_node) - assert convert_node.get_type_name() == "Convert" - - reshape_node = get_next_node(convert_node) + reshape_node = get_next_node(mul_node) assert reshape_node.get_type_name() == "Reshape" + convert_node = get_next_node(reshape_node) + assert convert_node.get_type_name() == "Convert" + return { "scale": get_const_value(scale_node), } @@ -705,14 +705,19 @@ def test_compression_for_different_dtypes(): continue model = IdentityMatmul(weights_dtype=weight_dtype, activation_dtype=activation_dtype).ov_model - compressed_model = compress_weights(model) + compressed_model = compress_weights( + model, mode=CompressWeightsMode.INT4_SYM, ratio=1, group_size=1, all_layers=True + ) name_to_node_map = {op.get_friendly_name(): op for op in compressed_model.get_ops()} # Weight scale should be in fp16 nevertheless the weight data type scale_multiply_node = name_to_node_map["weights/fq_weights_1"] assert scale_multiply_node.input_value(1).get_node().get_element_type() == ov.Type.f16 - next_node = get_next_node(scale_multiply_node) + reshape_node = get_next_node(scale_multiply_node) + assert reshape_node.get_type_name() == "Reshape" + + next_node = get_next_node(reshape_node) if activation_dtype == np.float16: # There should be no convert node after multiply if both weights and activations are in f16 assert next_node.get_type_name() != "Convert"
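
Taken together, the series converges on one decompression pattern regardless of the original weight precision: the compressed weight constant is converted to f16, the optional zero point is subtracted, the result is multiplied by an f16 scale, grouped weights are reshaped back to their original shape, and only then is an f16 -> f32 Convert appended when the original weight constant was f32. The following is a minimal, self-contained sketch of that pattern for an INT8-asymmetric weight, written against OpenVINO's opset13 Python API; it is an illustration rather than code from the patches, and the "weights" node names and sample values are assumptions.

    # Sketch of the final decompression subgraph (illustrative values and names).
    import numpy as np
    import openvino as ov
    from openvino.runtime import opset13 as opset

    # Compressed weight stored as u8; converted to f16 for dequantization.
    compressed = opset.constant(
        np.array([[255, 0], [128, 64]], dtype=np.uint8), dtype=ov.Type.u8, name="weights"
    )
    converted = opset.convert(compressed, ov.Type.f16)

    # Asymmetric mode: subtract a per-channel zero point, also converted to f16.
    zero_point = opset.constant(
        np.array([[128], [128]], dtype=np.uint8), dtype=ov.Type.u8, name="weights/zero_point"
    )
    converted = opset.subtract(
        converted, opset.convert(zero_point, ov.Type.f16), name="weights/zero_point/subtract"
    )

    # Scale is always stored and applied in f16.
    scale = opset.constant(
        np.array([[0.01], [0.02]], dtype=np.float16), dtype=ov.Type.f16, name="weights/scale"
    )
    mul = opset.multiply(converted, scale, name="weights/fq_weights_1")

    # For grouped (INT4) compression a Reshape back to the original weight shape comes first;
    # the trailing Convert is added only when the original weight constant was f32.
    mul = opset.convert(mul, ov.Type.f32, name="weights/fq_weights_1/convert")

Keeping the scale constant in f16 and deferring the f32 Convert until after the (optional) Reshape keeps the stored constants compact and gives f16- and f32-weight models the same subgraph shape, which is what the tests in the final revision assert.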