diff --git a/nncf/openvino/quantization/compression_primitives.py b/nncf/openvino/quantization/compression_primitives.py
index a9582988c53..556c999ffe5 100644
--- a/nncf/openvino/quantization/compression_primitives.py
+++ b/nncf/openvino/quantization/compression_primitives.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import openvino as ov
@@ -22,7 +22,30 @@ class OVCompressionPrimitiveCache:
     def __init__(self):
         self._compress_weight_model_cache = {}
+        self._compress_weight_end_to_end_model_cache = {}
         self._compress_decompress_weight_model_cache = {}
+        self._compress_decompress_end_to_end_weight_model_cache = {}
+
+    def get_compress_weight_primitive_end_to_end(
+        self,
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        reduction_axes: Optional[Tuple],
+        invert_scale: Optional[bool] = False,
+    ):
+        DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
+        if DYNAMIC_COMPRESSION:
+            weight_shape = (-1,) * len(weight_shape)
+
+        recompile = bool(int(os.environ.get("RECOMPILE", "0")))
+        if recompile:
+            return self._build_compress_model_end_to_end(config, weight_shape, reduction_axes, invert_scale)
+        key = (config.mode, config.num_bits, weight_shape, reduction_axes, invert_scale)
+        if key not in self._compress_weight_end_to_end_model_cache:
+            self._compress_weight_end_to_end_model_cache[key] = self._build_compress_model_end_to_end(
+                config, weight_shape, reduction_axes, invert_scale
+            )
+        return self._compress_weight_end_to_end_model_cache[key]
 
     def get_compress_weight_primitive(
         self,
@@ -55,28 +78,97 @@ def get_compress_decompress_weight_primitive(
         self,
         config: WeightCompressionConfig,
         weight_shape: Tuple,
-        scale_shape: Tuple,
+        reduction_axes: Optional[Tuple] = None,
+        scale_shape: Optional[Tuple] = None,
         zero_point_shape: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
     ):
         DYNAMIC_COMPRESSION = bool(int(os.environ.get("DYNAMIC_COMPRESSION", "0")))
         if DYNAMIC_COMPRESSION:
             weight_shape = (-1,) * len(weight_shape)
-            scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
+            if scale_shape is not None:
+                scale_shape = (-1,) * (len(scale_shape) - 1) + (1,)
             if zero_point_shape is not None:
                 zero_point_shape = (-1,) * (len(zero_point_shape) - 1) + (1,)
 
         recompile = bool(int(os.environ.get("RECOMPILE", "0")))
         if recompile:
-            return self._build_compress_decompress_model(config, weight_shape, scale_shape, zero_point_shape)
+            return self._build_compress_decompress_model(
+                config, weight_shape, reduction_axes, scale_shape, zero_point_shape, invert_scale
+            )
-        key = (config.mode, config.num_bits, weight_shape, scale_shape)
+        key = (config.mode, config.num_bits, weight_shape, invert_scale)
+        if reduction_axes is not None:
+            key += (reduction_axes,)
+        if scale_shape is not None:
+            key += (scale_shape,)
         if zero_point_shape is not None:
             key += (zero_point_shape,)
         if key not in self._compress_decompress_weight_model_cache:
             self._compress_decompress_weight_model_cache[key] = self._build_compress_decompress_model(
-                config, weight_shape, scale_shape, zero_point_shape
+                config, weight_shape, reduction_axes, scale_shape, zero_point_shape, invert_scale
            )
         return self._compress_decompress_weight_model_cache[key]
 
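Note: every get_*_primitive* accessor in this class follows the same compile-once-per-key pattern, with RECOMPILE=1 as a deliberate cache bypass (e.g. for timing model compilation). A minimal sketch of that pattern, using a hypothetical build_model callback (not part of the patch):

import os

_cache = {}

def get_primitive(key, build_model):
    # RECOMPILE=1 bypasses the cache so every call pays the compilation cost
    if bool(int(os.environ.get("RECOMPILE", "0"))):
        return build_model()
    # otherwise compile once per (mode, num_bits, shapes, ...) key and reuse
    if key not in _cache:
        _cache[key] = build_model()
    return _cache[key]
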
+    @staticmethod
+    def _build_compress_model_end_to_end(
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        reduction_axes: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
+        return_nodes: bool = False,
+    ):
+        INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32")
+
+        if INPUT_DTYPE == "fp32":
+            input_dtype = ov.Type.f32
+        elif INPUT_DTYPE == "fp16":
+            input_dtype = ov.Type.f16
+        elif INPUT_DTYPE == "bf16":
+            input_dtype = ov.Type.bf16
+        else:
+            raise ValueError(f"Unsupported input dtype: {INPUT_DTYPE}")
+        weight = opset.parameter(weight_shape, name="w", dtype=input_dtype)
+        parameters = [weight]
+
+        mode = config.mode
+        num_bits = config.num_bits
+        eps = np.finfo(np.float32).eps
+        if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            min_values = opset.reduce_min(weight, reduction_axes=reduction_axes,
+                                          keep_dims=True)  # [a1, r, a2] -> [a1, 1, a2]
+            max_values = opset.reduce_max(weight, reduction_axes=reduction_axes,
+                                          keep_dims=True)  # [a1, r, a2] -> [a1, 1, a2]
+            min_values, max_values = opset.convert(min_values, ov.Type.f32), opset.convert(max_values, ov.Type.f32)
+
+            level_low = 0
+            level_high = 2**num_bits - 1
+            levels = level_high - level_low + 1
+            scale = (max_values - min_values) / opset.constant(levels - 1, ov.Type.f32)
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+            zero_point = opset.constant(level_low, ov.Type.f32) - opset.round(min_values / scale)
+            zero_point = opset.clamp(zero_point, level_low, level_high)
+        else:
+            zero_point = None
+            level_high = opset.constant(2 ** (num_bits - 1), ov.Type.f32)
+
+            w_abs_min = opset.abs(opset.reduce_min(weight, reduction_axes=reduction_axes, keep_dims=True))
+            w_max = opset.reduce_max(weight, reduction_axes=reduction_axes, keep_dims=True)
+            w_abs_min, w_max = opset.convert(w_abs_min, ov.Type.f32), opset.convert(w_max, ov.Type.f32)
+
+            scale = opset.select(w_abs_min >= w_max, w_abs_min, -w_max)
+            scale /= level_high
+            scale = opset.select(opset.abs(scale) < eps, eps, scale)
+
+        return OVCompressionPrimitiveCache._get_compress_model(
+            config,
+            parameters,
+            weight,
+            scale,
+            zero_point,
+            output_only_weight=False,
+            invert_scale=invert_scale,
+            return_nodes=return_nodes,
+        )
+
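Note: the end-to-end graph above moves NNCF's integer quantization parameter math into OpenVINO. A rough numpy equivalent of the same formulas, given here only as a sanity check of the math (assumed helper, not part of the patch):

import numpy as np

def reference_int_params(weight, axis, num_bits=8, asym=True):
    # Mirrors the OV graph: per-channel min/max -> scale (and zero point for asym).
    eps = np.finfo(np.float32).eps
    if asym:
        min_values = weight.min(axis=axis, keepdims=True).astype(np.float32)
        max_values = weight.max(axis=axis, keepdims=True).astype(np.float32)
        level_low, level_high = 0, 2**num_bits - 1
        scale = (max_values - min_values) / (level_high - level_low)
        scale = np.where(np.abs(scale) < eps, eps, scale)
        zero_point = np.clip(level_low - np.round(min_values / scale), level_low, level_high)
        return scale, zero_point
    # Symmetric case: the bound with the larger magnitude defines the (signed) scale.
    w_abs_min = np.abs(weight.min(axis=axis, keepdims=True)).astype(np.float32)
    w_max = weight.max(axis=axis, keepdims=True).astype(np.float32)
    scale = np.where(w_abs_min >= w_max, w_abs_min, -w_max) / 2 ** (num_bits - 1)
    scale = np.where(np.abs(scale) < eps, eps, scale)
    return scale, None
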
     @staticmethod
     def _build_compress_model(
         config: WeightCompressionConfig,
@@ -87,15 +179,73 @@ def _build_compress_model(
         return_nodes: bool = False,
     ):
         INPUT_DTYPE = os.environ.get("INPUT_DTYPE", "fp32")
-        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
-        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
-        input_dtype = ov.Type.f32 if INPUT_DTYPE == "fp32" else ov.Type.f16 if INPUT_DTYPE == "fp16" else ov.Type.bf16
-        w = opset.parameter(weight_shape, name="w", dtype=input_dtype)
-        s = opset.parameter(scale_shape, name="s")
-        parameters = [w, s]
+        if INPUT_DTYPE == "fp32":
+            input_dtype = ov.Type.f32
+        elif INPUT_DTYPE == "fp16":
+            input_dtype = ov.Type.f16
+        elif INPUT_DTYPE == "bf16":
+            input_dtype = ov.Type.bf16
+        else:
+            raise ValueError(f"Unsupported input dtype: {INPUT_DTYPE}")
+        weight = opset.parameter(weight_shape, name="w", dtype=input_dtype)
+        scale = opset.parameter(scale_shape, name="s")
+        parameters = [weight, scale]
+
+        zero_point = None
+        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            zero_point = opset.parameter(zero_point_shape, name="zp")
+            parameters.append(zero_point)
+
+        return OVCompressionPrimitiveCache._get_compress_model(
+            config,
+            parameters,
+            weight,
+            scale,
+            zero_point,
+            output_only_weight=True,
+            invert_scale=invert_scale,
+            return_nodes=return_nodes,
+        )
+
+    @staticmethod
+    def _build_compress_decompress_model_end_to_end(
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        reduction_axes: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
+    ):
+        parameters, results = OVCompressionPrimitiveCache._build_compress_model_end_to_end(
+            config, weight_shape, reduction_axes, invert_scale, return_nodes=True
+        )
+        # `results` holds the compressed weight, scale and, possibly, zero point
+        return OVCompressionPrimitiveCache._get_compress_decompress_model(config, parameters, results)
+
+    @staticmethod
+    def _build_compress_decompress_model(
+        config: WeightCompressionConfig,
+        weight_shape: Tuple,
+        reduction_axes: Optional[Tuple] = None,
+        scale_shape: Optional[Tuple] = None,
+        zero_point_shape: Optional[Tuple] = None,
+        invert_scale: Optional[bool] = False,
+    ):
+        if scale_shape is None:
+            # No precomputed scale: fall back to the end-to-end graph that derives it from the weight
+            return OVCompressionPrimitiveCache._build_compress_decompress_model_end_to_end(
+                config, weight_shape, reduction_axes, invert_scale
+            )
+        parameters, results = OVCompressionPrimitiveCache._build_compress_model(
+            config, weight_shape, scale_shape, zero_point_shape, invert_scale=invert_scale, return_nodes=True
+        )
+        # `results` holds only the compressed weight
+        return OVCompressionPrimitiveCache._get_compress_decompress_model(config, parameters, results)
 
-        if input_dtype != ov.Type.f32:
+    @staticmethod
+    def _get_compress_model(
+        config: WeightCompressionConfig,
+        parameters: List[ov._pyopenvino.op.Parameter],
+        w: ov.runtime.Node,
+        s: ov.runtime.Node,
+        zp: Optional[ov.runtime.Node] = None,
+        output_only_weight: Optional[bool] = True,
+        invert_scale: Optional[bool] = False,
+        return_nodes: Optional[bool] = False,
+    ):
+        if w.get_element_type() != ov.Type.f32:
             w = opset.convert(w, ov.Type.f32)
 
         compressed_w = w * (1 / s) if invert_scale else w / s
@@ -105,9 +255,6 @@ def _build_compress_model(
             dtype = ov.Type.u8 if config.mode == CompressWeightsMode.INT8_ASYM else ov.Type.u4
             level_low = 0
             level_high = 2**num_bits - 1
-
-            zp = opset.parameter(zero_point_shape, name="zp")
-            parameters.append(zp)
             compressed_w += zp
         elif config.mode in [CompressWeightsMode.INT8_SYM, config.mode.INT4_SYM]:
             dtype = ov.Type.i8 if config.mode == CompressWeightsMode.INT8_SYM else ov.Type.i4
@@ -116,39 +263,49 @@
         else:
             raise Exception
 
-        result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")
+        compressed_w = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights")
 
+        INT8_OUTPUT = bool(int(os.environ.get("INT8_OUTPUT", "0")))
         if INT8_OUTPUT:
-            result = opset.convert(result, dtype)
+            compressed_w = opset.convert(compressed_w, dtype)
+
+        results = [compressed_w]
+        if not output_only_weight:
+            results.append(s)
+            if zp is not None:
+                results.append(zp)
 
         if return_nodes:
-            return parameters, result
+            return parameters, results
 
-        model = ov.Model([result], parameters)
+        model = ov.Model(results, parameters)
         compiled_model = ov.compile_model(model, device_name="CPU")
 
-        return lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)[0]
+        SHARE_OUTPUTS = bool(int(os.environ.get("SHARE_OUTPUTS", "0")))
+        return compiled_model, lambda parameters: compiled_model(parameters, share_outputs=SHARE_OUTPUTS)
 
     @staticmethod
-    def _build_compress_decompress_model(
+    def _get_compress_decompress_model(
         config: WeightCompressionConfig,
-        weight_shape: Tuple,
-        scale_shape: Tuple,
-        zero_point_shape: Optional[Tuple] = None,
+        parameters: List[ov._pyopenvino.op.Parameter],
+        results: List[ov.runtime.Node],
     ):
-        parameters, clamp = OVCompressionPrimitiveCache._build_compress_model(
-            config, weight_shape, scale_shape, zero_point_shape, return_nodes=True
-        )
-
-        if len(parameters) == 3:
-            _, s, zp = parameters
-            result = (clamp - zp) * s
+        if config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
+            if len(results) == 1:
+                compressed_w = results[0]
+                s, zp = parameters[1], parameters[2]
+            else:
+                compressed_w, s, zp = results
+            decompressed_w = (compressed_w - zp) * s
         else:
-            s = parameters[1]
-            result = clamp * s
+            if len(results) == 1:
+                compressed_w = results[0]
+                s = parameters[1]
+            else:
+                compressed_w, s = results
+            decompressed_w = compressed_w * s
 
-        model = ov.Model([result], parameters)
+        model = ov.Model([decompressed_w], parameters)
         compiled_model = ov.compile_model(model, device_name="CPU")
 
         return lambda parameters: compiled_model(parameters)[0]
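Note: with this change the compress primitives return a (compiled_model, infer) pair instead of a bare callable, and the end-to-end variant emits several outputs. A hypothetical caller would unpack them roughly like this (a sketch only: `cfg` is assumed to be a WeightCompressionConfig for INT8_ASYM and `w` a C-contiguous float32 numpy array of shape [out_ch, in_ch]):

compiled_model, infer = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive_end_to_end(
    cfg, w.shape, reduction_axes=(1,)
)
outputs = infer((w,))  # compressed weight, scale and zero point, in that order
compressed, scale, zero_point = outputs[0], outputs[1], outputs[2]
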
diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
index 0a4124e5760..8a7e3a86008 100644
--- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py
+++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py
@@ -290,6 +290,7 @@ def calculate_quantized_weight(
     config: WeightCompressionConfig,
     scale: Tensor,
     zero_point: Optional[Tensor] = None,
+    reduction_axes: Optional[Tuple] = None,
     invert_scale=False,
-) -> Tensor:
+) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
     """
@@ -308,34 +309,44 @@ def calculate_quantized_weight(
     if weight.backend == TensorBackend.numpy and not is_openvino_available():
         log_once(logging.INFO, "Compression time may improve after installing OpenVINO")
 
+    if hasattr(weight.data, "flags"):
+        assert weight.data.flags["C_CONTIGUOUS"]
+
     NUMPY_COMPRESSION = bool(int(os.environ.get("NUMPY_COMPRESSION", "0")))
     if weight.backend in [TensorBackend.numpy, TensorBackend.ov] and is_openvino_available() and not NUMPY_COMPRESSION:
         from nncf.openvino.quantization.compression_primitives import OV_COMPRESSION_PRIMITIVE_CACHE
 
-        zero_point_shape = None if zero_point is None else zero_point.shape
-        compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(
-            config, weight.shape, scale.shape, zero_point_shape
-        )
-
-        if hasattr(weight.data, "flags"):
-            assert weight.data.flags["C_CONTIGUOUS"]
-        input_tensors = weight.data, scale.data
-        if zero_point is not None:
-            input_tensors += (zero_point.data,)
-        compressed_weights = Tensor(compress_weight_primitive(input_tensors))
+        input_tensors = (weight.data,)
+        if scale is not None:
+            zero_point_shape = None if zero_point is None else zero_point.shape
+            compiled_model, compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive(
+                config, weight.shape, scale.shape, zero_point_shape
+            )
+            input_tensors += (scale.data,)
+            if zero_point is not None:
+                input_tensors += (zero_point.data,)
+            compressed_weights = Tensor(compress_weight_primitive(input_tensors)[0])
+        else:
+            compiled_model, compress_weight_primitive = OV_COMPRESSION_PRIMITIVE_CACHE.get_compress_weight_primitive_end_to_end(
+                config, weight.shape, reduction_axes, invert_scale
+            )
+            results = compress_weight_primitive(input_tensors)
+            results = [Tensor(results[i]) for i in range(len(results))]
+            if asym_quant:
+                compressed_weights, scale, zero_point = results
+            else:
+                compressed_weights, scale = results
     else:
         if weight.dtype != TensorDataType.float32:
             weight = weight.astype(TensorDataType.float32)
-        if scale.dtype != TensorDataType.float32:
-            scale = scale.astype(TensorDataType.float32)
+        assert scale.dtype == TensorDataType.float32
 
         num_bits = config.num_bits
         level_low = 0 if asym_quant else -(2 ** (num_bits - 1))
         level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1
 
         if invert_scale:
-            scale = fns.power(scale, -1)
-            compressed_weights = weight * scale
+            compressed_weights = weight * fns.power(scale, -1)
         else:
             compressed_weights = weight / scale
         if zero_point is not None:
@@ -347,7 +358,7 @@ def calculate_quantized_weight(
     if compressed_weights.dtype != dtype:
         compressed_weights = compressed_weights.astype(dtype)
 
-    return compressed_weights
+    return compressed_weights, scale, zero_point
 
 
 def calculate_quantized_dequantized_weight(
@@ -415,18 +426,22 @@ def do_int_quantization(
     if weight.dtype != TensorDataType.float32 and INPUT_DTYPE == "fp32":
         weight = weight.astype(TensorDataType.float32)
 
-    if group_size != -1:
-        # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
-        weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
-
-    if precomputed_zero_point is None or precomputed_zero_point is None:
-        scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config)
-    if precomputed_scale is not None:
-        scale = precomputed_scale
-    if precomputed_zero_point is not None:
-        zero_point = precomputed_zero_point
+    END_TO_END_COMPRESSION = bool(int(os.environ.get("END_TO_END_COMPRESSION", "0")))
+    if not END_TO_END_COMPRESSION or group_size != -1:
+        if group_size != -1:
+            # weights are reshaped from [a1, r, a2] to [a1, r//gs, gs, a2]
+            weight, reduction_axes = reshape_weight_for_grouped_quantization(weight, reduction_axes, group_size)
+
+        if precomputed_zero_point is None or precomputed_scale is None:
+            scale, zero_point = calculate_integer_quantization_params(weight, reduction_axes, config)
+        if precomputed_scale is not None:
+            scale = precomputed_scale
+        if precomputed_zero_point is not None:
+            zero_point = precomputed_zero_point
+    else:
+        scale = zero_point = None
 
-    compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point, invert_scale)
+    compressed_weights, scale, zero_point = calculate_quantized_weight(
+        weight, config, scale, zero_point, reduction_axes, invert_scale
+    )
 
     return compressed_weights, scale, zero_point
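Note: with END_TO_END_COMPRESSION=1 and per-channel quantization (group_size == -1), do_int_quantization now leaves scale and zero_point as None and lets the OV graph derive them; grouped quantization still computes parameters on the numpy side first. An illustrative call, assuming `weight` is an NNCF Tensor and `config` a WeightCompressionConfig with group_size == -1 (tensor and config construction elided):

import os

os.environ["END_TO_END_COMPRESSION"] = "1"
# scale and zero_point come back from the compiled OV model rather than numpy
compressed_weights, scale, zero_point = do_int_quantization(weight, (1,), config)
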
diff --git a/weight_compression.py b/weight_compression.py
index 864fcd60d7e..29e9247bd24 100644
--- a/weight_compression.py
+++ b/weight_compression.py
@@ -32,9 +32,11 @@ def parse_arguments():
     parser.add_argument("--log-dir", default="./compression_logs", type=str, help="Directory where logs will be saved")
 
-    parser.add_argument("--numpy-compression", action="store_true", help="Enable numpy compression")
+    parser.add_argument("--numpy", action="store_true", help="Enable numpy compression")
 
-    parser.add_argument("--dynamic-compression", action="store_true", help="Enable dynamic compression")
+    parser.add_argument("--dynamic", action="store_true", help="Enable compression with dynamic-shaped OV models")
+
+    parser.add_argument("--end-to-end", action="store_true", help="Enable end-to-end OV compression")
 
     parser.add_argument("--input-dtype", type=str, choices=["fp32", "fp16", "bf16"], default="fp32",
                         help="OV model input dtype")
@@ -61,8 +63,10 @@ def main(args):
     model_path = Path(args.model_path)
     log_dir = Path(args.log_dir)
 
-    numpy_compression = args.numpy_compression
-    dynamic_compression = args.dynamic_compression
+    numpy_compression = args.numpy
+    dynamic_compression = args.dynamic
+    end_to_end_compression = args.end_to_end
     input_dtype = args.input_dtype
     int8_output = args.int8_output
     recompile = args.recompile
@@ -71,8 +75,9 @@ def main(args):
     if numpy_compression:
         log_dir_suffix = "numpy"
     else:
-        log_dir_suffix = "ov-dynamic" if dynamic_compression else "ov-static"
-        log_dir_suffix = f"{log_dir_suffix}_{('output-int8' if int8_output else 'output-fp32')}"
+        log_dir_suffix = "end-to-end_" if end_to_end_compression else ""
+        log_dir_suffix = f"{log_dir_suffix}{'ov-dynamic' if dynamic_compression else 'ov-static'}"
+        log_dir_suffix = f"{log_dir_suffix}_{'output-int8' if int8_output else 'output-fp32'}"
         log_dir_suffix = f"{log_dir_suffix}_{f'input-{input_dtype}'}"
         if recompile:
             log_dir_suffix = f"{log_dir_suffix}_recompile"
@@ -91,6 +96,7 @@ def main(args):
 
     os.environ["NUMPY_COMPRESSION"] = f"{int(numpy_compression)}"
     os.environ["DYNAMIC_COMPRESSION"] = f"{int(dynamic_compression)}"
+    os.environ["END_TO_END_COMPRESSION"] = f"{int(end_to_end_compression)}"
    os.environ["INPUT_DTYPE"] = input_dtype
     os.environ["INT8_OUTPUT"] = f"{int(int8_output)}"
     os.environ["RECOMPILE"] = f"{int(recompile)}"
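Note: after this rename the benchmarking script would be driven as, e.g., `python weight_compression.py --model-path <model-dir> --end-to-end --input-dtype fp16` (the model-path flag spelling is assumed from the `args.model_path` destination; `--int8-output` and `--recompile` are defined elsewhere in the script). Each flag maps one-to-one onto the environment variable consumed by weight_lowering.py and compression_primitives.py above.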