Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scale estimation/rectification for int4 compression #2549

Merged
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9f29c53
Scale estimation for 4bit compression.
andreyanufr Mar 4, 2024
2b657f3
Fixed name to node mapping sharing.
andreyanufr Mar 5, 2024
00c8d12
Added algo flag and removed debug information.
andreyanufr Mar 5, 2024
eab7f49
Cache set of used ov networks for compression/decompression
andreyanufr Mar 12, 2024
3908ef9
Added test for scale estimation.
andreyanufr Mar 12, 2024
8961142
Changed variable names.
andreyanufr Mar 12, 2024
3bc4ec2
Added advanced parameters for compression.
andreyanufr Mar 13, 2024
0292b2a
Added advanced parameters to compression algo.
andreyanufr Mar 13, 2024
7cec4ce
Changed AWQ logic.
andreyanufr Mar 14, 2024
18313df
Added conformance test.
andreyanufr Mar 18, 2024
9d4c122
Fixed conformance test
andreyanufr Mar 18, 2024
b35de99
Removed duplicated code.
andreyanufr Mar 19, 2024
c69d8eb
1) Fixed bugs with parameter.
andreyanufr Mar 20, 2024
7999c6b
Merge remote-tracking branch 'upstream/develop' into andreyan/scale_e…
andreyanufr Mar 22, 2024
f5111a8
Stateless scale estimation test.
andreyanufr Mar 22, 2024
bd5c1e1
1) Added conformance metrics for scale estimation.
andreyanufr Mar 25, 2024
ad9124b
Fixed style.
andreyanufr Mar 25, 2024
d1b566c
Added filtering of special tokens in preprocessing.
andreyanufr Mar 26, 2024
0d19c69
Added scale estimation for asym_int4.
andreyanufr Mar 28, 2024
c89c939
1) Added high-level comments about scale estimation.
andreyanufr Mar 28, 2024
5e7d899
Fix in comment.
andreyanufr Mar 28, 2024
81f02fc
Added extra hyperparameter for scale estimation.
andreyanufr Apr 8, 2024
8ab8f36
Update nncf/quantization/algorithms/weight_compression/scale_estimati…
andreyanufr Apr 15, 2024
c85a415
Added inplace layer compression in scale estimation.
andreyanufr Apr 15, 2024
1f13800
Merge branch 'andreyan/scale_estimation_pr_inplace' into andreyan/sca…
andreyanufr Apr 16, 2024
680939b
Resolve conflict.
andreyanufr Apr 16, 2024
1cd2c7f
Changed PT backend compression according to OV backend changes.
andreyanufr Apr 16, 2024
2fb43bd
Updated conformance test references.
andreyanufr Apr 17, 2024
4371295
Update references in conformance tests.
andreyanufr Apr 18, 2024
d41e29c
Revert change related to inplace compression in scale estimation.
andreyanufr Apr 19, 2024
6a42c35
Merge remote-tracking branch 'upstream/develop' into andreyan/scale_e…
andreyanufr Apr 19, 2024
413eec2
Added precomputed scale as new parameter.
andreyanufr Apr 19, 2024
065e4f4
Updated docstrings for compression algo hyperparameters.
andreyanufr Apr 19, 2024
007d92d
Fixed pylint error.
andreyanufr Apr 19, 2024
ad81ac0
Added extra check for compression parameters combination.
andreyanufr Apr 19, 2024
6771faf
Reduce OV test scope.
andreyanufr Apr 19, 2024
8d2a842
Reduce OV test scope.
andreyanufr Apr 19, 2024
0c2a1f7
Added exception for AWQ and SE in the case group_size==-1.
andreyanufr Apr 22, 2024
afcf3b3
Refactoring.
andreyanufr Apr 25, 2024
140c31f
Merge remote-tracking branch 'upstream/develop' into andreyan/scale_e…
andreyanufr Apr 25, 2024
795742d
Update nncf/quantization/quantize_model.py
andreyanufr Apr 26, 2024
f06095e
Fixed return type.
andreyanufr Apr 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion nncf/openvino/quantization/quantize_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,14 +404,15 @@ def compress_weights_impl(
sensitivity_metric: SensitivityMetric,
awq: bool,
subset_size: int,
scale_estimation: bool,
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
) -> ov.Model:
"""
Implementation of the `compress_weights()` method for the OpenVINO backend.
"""

model = remove_friendly_name_duplicates(model)
compression_algorithm = WeightCompression(
mode, ratio, group_size, ignored_scope, all_layers, sensitivity_metric, awq, subset_size
mode, ratio, group_size, ignored_scope, all_layers, sensitivity_metric, awq, subset_size, scale_estimation
)
graph = NNCFGraphFactory.create(model)
return compression_algorithm.apply(model, graph, dataset=dataset)
75 changes: 75 additions & 0 deletions nncf/quantization/advanced_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import nncf
from nncf.common.quantization.structs import QuantizationScheme as QuantizationMode
from nncf.common.utils.api_marker import api
from nncf.parameters import SensitivityMetric
from nncf.quantization.range_estimator import AggregatorType
from nncf.quantization.range_estimator import RangeEstimatorParameters
from nncf.quantization.range_estimator import StatisticsType
Expand Down Expand Up @@ -238,6 +239,80 @@ class AdvancedQuantizationParameters:
backend_params: Dict[str, Any] = field(default_factory=dict)


@dataclass
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
class AdvancedAWQParameters:
"""
Contains advanced parameters for AWQ algorithm.

:param subset_size: The number of samples for AWQ.
:param percent_to_apply: The fraction of outliers for correction, expressed as a ratio
rather than a percentage value (the default 0.002 corresponds to 0.2%).
:param alpha_min: Minimum value of smoothness parameter for grid search.
:param alpha_max: Maximal value of smoothness parameter for grid search.
:param steps: The number of the steps in grid search.
"""

subset_size: int = 32
percent_to_apply: float = 0.002
alpha_min: float = 0.01
alpha_max: float = 1.0
steps: int = 100


@dataclass
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
class AdvancedScaleEstimationParameters:
"""
Contains advanced parameters for scale estimation algorithm.

:param subset_size: The number of samples for scale estimation.
:param initial_steps: The number of the steps for absmax scale rectification.
:param scale_steps: The number of the steps for grid search scale rectification
from 1.0 to 1.0 - 0.05 * scale_steps.
"""

subset_size: int = 32
initial_steps: int = 5
scale_steps: int = 10


@dataclass
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
class AdvancedSensitivityParameters:
"""
Contains advanced parameters for the sensitivity metric that is used for assigning
quantization precision to layers.

:param sensitivity_metric: The sensitivity metric for assigning quantization precision to layers. In order to
preserve the accuracy of the model, the more sensitive layers receive a higher precision.
:param subset_size: Number of data samples to calculate activation statistics used for assigning different
quantization precision.
"""

sensitivity_metric: SensitivityMetric = SensitivityMetric.WEIGHT_QUANTIZATION_ERROR
subset_size: int = 128


@api()
@dataclass
class AdvancedCompressionParameters:
"""
Contains advanced parameters for the weight compression algorithms.
Every group of parameters defaults to a freshly created instance of its parameter class.

:param sensitivity_params: Advanced parameters of the sensitivity metric.
:param awq_params: Advanced parameters of the AWQ algorithm.
:param scale_estimation_params: Advanced parameters of the scale estimation algorithm.
"""

# Advanced sensitivity algorithm parameters
sensitivity_params: AdvancedSensitivityParameters = field(default_factory=AdvancedSensitivityParameters)

# Advanced AWQ algorithm parameters
awq_params: AdvancedAWQParameters = field(default_factory=AdvancedAWQParameters)

# Advanced scale estimation algorithm parameters
scale_estimation_params: AdvancedScaleEstimationParameters = field(
default_factory=AdvancedScaleEstimationParameters
)


@api()
@dataclass
class AdvancedAccuracyRestorerParameters:
Expand Down
35 changes: 34 additions & 1 deletion nncf/quantization/algorithms/weight_compression/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@
from nncf.experimental.tensor.definitions import TensorDataType
from nncf.parameters import CompressWeightsMode
from nncf.parameters import SensitivityMetric
from nncf.quantization.advanced_parameters import AdvancedCompressionParameters
from nncf.quantization.algorithms.algorithm import Algorithm
from nncf.quantization.algorithms.weight_compression.awq import AWQ
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
from nncf.scopes import IgnoredScope
from nncf.scopes import get_ignored_node_names_from_ignored_scope
Expand All @@ -60,6 +62,8 @@ def __init__(
sensitivity_metric: SensitivityMetric,
awq: bool,
subset_size: int,
scale_estimation: bool,
advanced_parameters: Optional[AdvancedCompressionParameters] = None,
):
"""
:param mode: Defines a mode for weight compression.
Expand Down Expand Up @@ -88,6 +92,7 @@ def __init__(
:param awq: determines whether to use or not modified AWQ algorithm.
:param subset_size: Number of data samples to calculate activation statistics used for assigning different
quantization precision.
:param scale_estimation: determines whether to use or not scale estimation for 4 bit layers.
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
"""
super().__init__()
self._mode = mode
Expand All @@ -101,6 +106,10 @@ def __init__(
self._sensitivity_metric = sensitivity_metric
self._awq = awq
self._subset_size = subset_size
self._scale_estimation = scale_estimation
self._advanced_parameters = (
advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
)

@property
def available_backends(self) -> List[BackendType]:
Expand Down Expand Up @@ -339,11 +348,35 @@ def do_compression(
nncf_logger.info(self._get_bitwidth_distribution_str(all_weight_params, ratio_defining_params))

if self._awq and activations is not None and self._mode != CompressWeightsMode.NF4:
awq_params = self._advanced_parameters.awq_params
awq_algo = AWQ(
model, self._backend_entity.name_to_node_mapping, all_weight_params, nodes_to_compress, activations
model,
self._backend_entity.name_to_node_mapping,
all_weight_params,
nodes_to_compress,
activations,
awq_params.subset_size,
awq_params.percent_to_apply,
awq_params.alpha_min,
awq_params.alpha_max,
awq_params.steps,
)
Comment on lines +354 to 364
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like this is out of the scope of this PR, but in my opinion the __init__ parameters should look like this:

Suggested change
model,
self._backend_entity.name_to_node_mapping,
all_weight_params,
nodes_to_compress,
activations,
awq_params.subset_size,
awq_params.percent_to_apply,
awq_params.alpha_min,
awq_params.alpha_max,
awq_params.steps,
)
awq_params.subset_size,
awq_params.percent_to_apply,
awq_params.alpha_min,
awq_params.alpha_max,
awq_params.steps,
)

This comment is something to think about.

awq_algo.apply(model, graph)

if self._scale_estimation and activations is not None and self._mode != CompressWeightsMode.NF4:
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
scale_estimation_params = self._advanced_parameters.scale_estimation_params
scale_algo = ScaleEstimation(
alexsu52 marked this conversation as resolved.
Show resolved Hide resolved
model,
self._backend_entity.name_to_node_mapping,
all_weight_params,
nodes_to_compress,
activations,
scale_estimation_params.subset_size,
scale_estimation_params.initial_steps,
scale_estimation_params.scale_steps,
)
scale_algo.apply(model, graph)

# Compress model using weight compression parameters
transformed_model = self._backend_entity.transform_model(
model, graph, track(all_weight_params, description="Applying Weight Compression")
Expand Down
19 changes: 14 additions & 5 deletions nncf/quantization/algorithms/weight_compression/awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(
activations: Optional[Dict[str, TTensor]] = None,
subset_size: int = 32,
percent_to_apply=0.002,
alpha_min=0.01,
alpha_min=0.0,
alpha_max=1.0,
steps=100,
):
Expand Down Expand Up @@ -107,8 +107,7 @@ def _set_backend_entity(self, model: TModel) -> None:
if model_backend == BackendType.OPENVINO:
from nncf.quantization.algorithms.weight_compression.openvino_backend import OVAWQAlgoAlgoBackend

self._backend_entity = OVAWQAlgoAlgoBackend(model)
self._backend_entity.name_to_node_mapping = self.name_to_node_mapping
self._backend_entity = OVAWQAlgoAlgoBackend(model, self.name_to_node_mapping)
self._patterns = self._backend_entity.get_awq_patterns()
else:
raise RuntimeError(
Expand Down Expand Up @@ -181,11 +180,15 @@ def apply(
stats = self._activations[k]
X = fns.stack([fns.mean(stat, axis=0) for stat in stats])
X = fns.transpose(X)
if X.shape[1] > self._subset_size:
X = X[:, : self._subset_size]

s = fns.max(fns.abs(X), axis=1)

if X.shape[1] > self._subset_size:
lens = [stat.shape[0] for stat in stats]
step = X.shape[1] // self._subset_size
idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
X = X[:, idxs]

top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
topk_idxs = fns.argsort(-s)[:top_k]

Expand Down Expand Up @@ -263,6 +266,12 @@ def apply(
merge_weight = merge_weight * a_scale
self._backend_entity.set_weight(merge_node, port_id, model, graph, merge_weight)

# update activations for next usage
a_scale_t = fns.transpose(a_scale)
for i, stat in enumerate(self._activations[k]):
stat = stat * a_scale_t
self._activations[k][i] = stat

return model

def get_statistic_points(self, model: TModel, graph: NNCFGraph) -> StatisticPointsContainer:
Expand Down
2 changes: 2 additions & 0 deletions nncf/quantization/algorithms/weight_compression/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class WeightCompressionParameters:
:param num_weights: Number of elements in the weight array.
:param reduction_axes: Axes, along which to reduce (collect) different statistics (e.g. min, max).
:param compression_config: Configuration of weight compression for the weight node.
:param precomputed_scale: Precomputed scale for weight compression.
"""

weight_name: str
Expand All @@ -59,6 +60,7 @@ class WeightCompressionParameters:
num_weights: np.uint64
reduction_axes: Tuple[int, ...]
compression_config = WeightCompressionConfig()
precomputed_scale = None

def __post_init__(self):
# Explicitly cast num_weights to avoid overflow on finding total number of weights.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,11 @@


class OVWeightCompressionAlgoBackend(WeightCompressionAlgoBackend):
def __init__(self, model: ov.Model):
self.name_to_node_mapping = OVModelTransformer._get_name_to_node_mapping(model)
def __init__(self, model: ov.Model, name_to_node_mapping: Dict = None):
if name_to_node_mapping is None:
self.name_to_node_mapping = OVModelTransformer._get_name_to_node_mapping(model)
else:
self.name_to_node_mapping = name_to_node_mapping

@property
def matmul_metatypes(self) -> List[OperatorMetatype]:
Expand Down Expand Up @@ -148,7 +151,9 @@ def transform_model(

weight = Tensor(get_const_value(const_node))
original_shape = weight.shape
compressed_weight = compress_weight(weight, wc_params.reduction_axes, compression_config)
compressed_weight = compress_weight(
weight, wc_params.reduction_axes, compression_config, wc_params.precomputed_scale
)

compressed_const = opset.constant(
compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name
Expand Down Expand Up @@ -190,6 +195,69 @@ def dump_parameters(
) -> None:
dump_parameters(model, parameters, algo_name, path)

@staticmethod
def get_compress_decompress_pipeline(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO:

Suggested change
def get_compress_decompress_pipeline(
def create_compress_decompress_fn(

weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape
):
config = weight_compression_parameter.compression_config
mode = config.mode
assert mode in [CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM]
num_bits = config.num_bits

level_low = 0
level_high = 2**num_bits - 1

input_node_w = opset.parameter(w_shape, name="w")
input_node_s = opset.parameter(s_shape, name="s")
input_node_zp = opset.parameter(z_p_shape, name="zp")

node_compression_div = opset.divide(input_node_w, input_node_s)
node_compression_add = opset.add(node_compression_div, input_node_zp)
node_compression_round = opset.round(node_compression_add)
node_compression_clamp = opset.clamp(node_compression_round, level_low, level_high)

result1 = opset.result(node_compression_clamp, name="compressed_weights")
result1.get_output_tensor(0).set_names(set(["compressed_weights"]))

node_decompression_add = opset.subtract(node_compression_clamp, input_node_zp)
node_decompression_mul = opset.multiply(node_decompression_add, input_node_s)
result2 = opset.result(node_decompression_mul, name="q_weights")
result2.get_output_tensor(0).set_names(set(["q_weights"]))

model = ov.Model([result1, result2], [input_node_w, input_node_s, input_node_zp])

compiled_model = ov.compile_model(model)

return compiled_model

@staticmethod
def get_compress_pipeline(weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape):
config = weight_compression_parameter.compression_config
mode = config.mode
assert mode in [CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM]
daniil-lyakhov marked this conversation as resolved.
Show resolved Hide resolved
num_bits = config.num_bits

level_low = 0
level_high = 2**num_bits - 1

input_node_w = opset.parameter(w_shape, name="w")
input_node_s = opset.parameter(s_shape, name="s")
input_node_zp = opset.parameter(z_p_shape, name="zp")

node_compression_div = opset.divide(input_node_w, input_node_s)
node_compression_add = opset.add(node_compression_div, input_node_zp)
node_compression_round = opset.round(node_compression_add)
node_compression_clamp = opset.clamp(node_compression_round, level_low, level_high)

result1 = opset.result(node_compression_clamp, name="compressed_weights")
result1.get_output_tensor(0).set_names(set(["compressed_weights"]))

model = ov.Model([result1], [input_node_w, input_node_s, input_node_zp])

compiled_model = ov.compile_model(model)

return compiled_model


class OVAWQAlgoAlgoBackend(OVWeightCompressionAlgoBackend):
@staticmethod
Expand Down
Loading
Loading