Lora Correction Algorithm for int4/nf4 weight compression
ljaljushkin committed Jul 22, 2024
1 parent 8744c20 commit 24b1f9e
Showing 18 changed files with 832 additions and 145 deletions.
@@ -61,7 +61,8 @@ nncf_dataset = nncf.Dataset(data_source, transform_fn)
compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_SYM, ratio=0.8, dataset=nncf_dataset) # model is openvino.Model object
```

- Accuracy of the 4-bit compressed models can also be improved by using the AWQ, Scale Estimation or GPTQ algorithms over the data-based mixed-precision algorithm. These algorithms work by equalizing a subset of weights to minimize the difference between the original precision and the 4-bit precision. The AWQ algorithm can be used in conjunction with either the Scale Estimation or GPTQ algorithm. However, the Scale Estimation and GPTQ algorithms are mutually exclusive and cannot be used together. Below are examples demonstrating how to enable the AWQ, Scale Estimation or GPTQ algorithms:
- Accuracy of the 4-bit compressed models can also be improved by using the AWQ, Scale Estimation, GPTQ or Lora Correction algorithms over the data-based mixed-precision algorithm. These algorithms work by equalizing a subset of weights to minimize the difference between the original precision and the 4-bit precision.
Unlike all the others, the Lora Correction algorithm inserts additional Linear layers to reduce quantization noise and further improve accuracy. Inevitably, this approach introduces memory and runtime overhead, but it is negligible, since the inserted weights are much smaller and can be quantized to 8-bit. The AWQ, Scale Estimation (SE) and Lora Correction (LC) algorithms can be used together in any combination: AWQ + SE, AWQ + LC, SE + LC, AWQ + SE + LC. The GPTQ algorithm can be combined with AWQ only. Below are examples demonstrating how to enable the AWQ, Scale Estimation, GPTQ or Lora Correction algorithms, individually and in combination:
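
For intuition, here is a minimal conceptual sketch of the correction idea (not the exact NNCF procedure, which additionally refines the adapters over several steps on calibration data): the quantization error of a weight matrix is approximated by a truncated SVD, and the two low-rank factors become the weights of the inserted Linear layers.

```python
# Conceptual sketch only: approximate the quantization error W - W_q by a
# rank-r product lora_B @ lora_A obtained from a truncated SVD.
import numpy as np

rank = 8
W = np.random.randn(768, 768).astype(np.float32)  # original weight
W_q = np.round(W * 2) / 2                         # stand-in for the dequantized 4-bit weight
E = W - W_q                                       # quantization error

U, S, Vt = np.linalg.svd(E, full_matrices=False)
lora_B = U[:, :rank] * S[:rank]                   # [out_features, rank]
lora_A = Vt[:rank, :]                             # [rank, in_features]

# Eckart-Young: the truncated SVD is the best rank-r approximation of the error.
assert np.linalg.norm(E - lora_B @ lora_A) <= np.linalg.norm(E)
```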

Prepare the calibration dataset for data-based algorithms:

@@ -135,6 +136,16 @@ model.model = compress_weights(model.model,
gptq=True)
```

- How to compress 80% of layers to 4-bit integer with the default data-based mixed-precision algorithm and the Lora Correction algorithm. It requires setting `lora_correction` to `True` in addition to the data-based mixed-precision algorithm.

```python
model.model = compress_weights(model.model,
                               mode=CompressWeightsMode.INT4_SYM,
                               ratio=0.8,
                               dataset=nncf_dataset,
                               lora_correction=True)
```
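
- How to combine AWQ, Scale Estimation and Lora Correction in one call. A sketch, assuming the `awq` and `scale_estimation` flags keep the names used in the examples above:

```python
model.model = compress_weights(model.model,
                               mode=CompressWeightsMode.INT4_SYM,
                               ratio=0.8,
                               dataset=nncf_dataset,
                               awq=True,
                               scale_estimation=True,
                               lora_correction=True)
```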

- `NF4` mode can be considered for improving accuracy, but models quantized to nf4 currently should not be expected to be faster than models
quantized to 8-bit asymmetric integer. Here is an example of how to compress weights to the nf4 data type with group size = 128, as sketched below.
Different `group_size` and `ratio` values are also supported.
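
A sketch of such a call, assuming `group_size` is passed directly to `compress_weights`:

```python
from nncf import CompressWeightsMode, compress_weights

# Compress weights to the nf4 data type with group size = 128.
compressed_model = compress_weights(model, mode=CompressWeightsMode.NF4, group_size=128, ratio=0.8)
```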
2 changes: 2 additions & 0 deletions nncf/openvino/quantization/quantize_model.py
@@ -437,6 +437,7 @@ def compress_weights_impl(
subset_size: int,
scale_estimation: bool,
gptq: bool,
lora_correction: bool,
advanced_parameters: Optional[AdvancedCompressionParameters] = None,
) -> ov.Model:
"""
@@ -455,6 +456,7 @@
subset_size,
scale_estimation,
gptq,
lora_correction,
advanced_parameters,
)
graph = NNCFGraphFactory.create(model)
31 changes: 31 additions & 0 deletions nncf/quantization/advanced_parameters.py
@@ -314,6 +314,34 @@ class AdvancedGPTQParameters:
subset_size: int = 128


@api()
@dataclass
class AdvancedLoraCorrectionParameters:
"""
Contains advanced parameters for lora correction algorithm.
:param rank: rank of lora adapters. Defaults to 8.
:type rank: int
:param n_iters: number of correction steps. Defaults to 3.
:type n_iters: int
:param w_regularization: Whether to do regularization during the correction process. Defaults to False.
Helpful for big rank values to avoid overfitting.
:type w_regularization: bool
:param subset_size: Number of data samples for lora correction algorithm. Defaults to 32.
:type subset_size: int
:param is_int8_adapters: Whether to 8-bit quantize lora adapters. Defaults to True.
Reasonable with dynamic quantization of activation to achieve the best acceleration.
:type is_int8_adapters: bool
"""

rank: int = 8
n_iters: int = 3
w_regularization: bool = False
subset_size: int = 32
is_int8_adapters: bool = True


@api()
@dataclass
class AdvancedCompressionParameters:
@@ -337,6 +365,9 @@ class AdvancedCompressionParameters:
# Advanced GPTQ algorithm parameters
gptq_params: AdvancedGPTQParameters = field(default_factory=AdvancedGPTQParameters)

# Advanced Lora Correction algorithm parameters
lora_correction_params: AdvancedLoraCorrectionParameters = field(default_factory=AdvancedLoraCorrectionParameters)
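
A hedged sketch of wiring these advanced parameters into a `compress_weights` call; the import path is assumed from this file's location:

```python
from nncf import CompressWeightsMode, compress_weights
from nncf.quantization.advanced_parameters import (
    AdvancedCompressionParameters,
    AdvancedLoraCorrectionParameters,
)

# Raise the adapter rank and the number of correction steps beyond the defaults (8 and 3).
advanced = AdvancedCompressionParameters(
    lora_correction_params=AdvancedLoraCorrectionParameters(rank=16, n_iters=5)
)
compressed_model = compress_weights(model,
                                    mode=CompressWeightsMode.INT4_SYM,
                                    ratio=0.8,
                                    dataset=nncf_dataset,
                                    lora_correction=True,
                                    advanced_parameters=advanced)
```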


@api()
@dataclass
44 changes: 44 additions & 0 deletions nncf/quantization/algorithms/weight_compression/activation_stats.py
@@ -0,0 +1,44 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple, TypeVar

from nncf.tensor import functions as fns

TTensor = TypeVar("TTensor")


def process_stats(stats: List[TTensor], subset_size: int) -> Tuple[TTensor, TTensor]:
"""
Processes activation statistics in the way shared by the AWQ, Scale Estimation and LoRA Correction algorithms.

:param stats: List of activation statistics for a layer, containing N tensors with shape [SeqLen, HiddenDim].
:type stats: List[TTensor]
:param subset_size: The maximum number of samples to select from the statistics.
:type subset_size: int
:return: Tuple of the following tensors:
    s - maximum channel magnitude across samples [HiddenDim],
    X - average channel magnitude across tokens in the sequence [HiddenDim, SampleSize].
:rtype: Tuple[TTensor, TTensor]
"""
X = fns.stack([fns.mean(stat, axis=0) for stat in stats]) # [Batch, HiddenDim]
X_full = fns.transpose(X) # [HiddenDim, Batch]

# Prevent high memory and time consumption: keep at most `subset_size` samples, preferring those with the longest sequences.
if X_full.shape[1] > subset_size:
lens = [stat.shape[0] for stat in stats]
step = X_full.shape[1] // subset_size
idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
X = X_full[:, idxs] # [HiddenDim, SampleSize]
else:
X = X_full
s = fns.max(fns.abs(X_full), axis=1) # [HiddenDim]
return s, X
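
A hypothetical shape-level usage sketch (assuming `nncf.tensor.Tensor` wraps NumPy arrays, as elsewhere in the codebase):

```python
import numpy as np

from nncf.tensor import Tensor

# Eight per-sample statistics with varying sequence lengths and HiddenDim = 16.
stats = [Tensor(np.random.rand(seq_len, 16).astype(np.float32)) for seq_len in range(8, 16)]

s, X = process_stats(stats, subset_size=4)
print(s.shape)  # (16,)   - maximum channel magnitude across all 8 samples
print(X.shape)  # (16, 4) - per-channel means of the 4 samples with the longest sequences
```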
15 changes: 14 additions & 1 deletion nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -33,6 +33,7 @@
from nncf.quantization.algorithms.weight_compression.awq import AWQ
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.gptq import GPTQ
from nncf.quantization.algorithms.weight_compression.lora_correction import LoraCorrectionAlgorithm
from nncf.quantization.algorithms.weight_compression.mixed_precision import MIXED_PRECISION_CRITERIA
from nncf.quantization.algorithms.weight_compression.scale_estimation import ScaleEstimation
from nncf.quantization.algorithms.weight_compression.weight_lowering import WeightCompressionConfig
@@ -65,6 +66,7 @@ def __init__(
subset_size: int,
scale_estimation: bool,
gptq: bool,
lora_correction: bool,
advanced_parameters: Optional[AdvancedCompressionParameters] = None,
):
"""
@@ -97,6 +99,7 @@
quantization precision.
:param scale_estimation: determines whether to use scale estimation for 4-bit layers.
:param gptq: determines whether to use the GPTQ algorithm.
:param lora_correction: determines whether to use the LoRA Correction algorithm.
:param advanced_parameters: advanced parameters for algorithms in compression pipeline.
"""
super().__init__()
@@ -113,6 +116,7 @@ def __init__(
self._subset_size = subset_size
self._scale_estimation = scale_estimation
self._gptq = gptq
self._lora_correction = lora_correction
self._advanced_parameters = (
advanced_parameters if advanced_parameters is not None else AdvancedCompressionParameters()
)
@@ -403,16 +407,24 @@ def apply(
backend_entity=self._backend_entity,
)

lora_correction_algo = None
description = "Applying Weight Compression"
if self._lora_correction:
lora_correction_params = self._advanced_parameters.lora_correction_params
lora_correction_algo = LoraCorrectionAlgorithm(activations, lora_correction_params)
description += " with correction of low-rank adapters"

# Sort weight params to start compression with the biggest constants. This lowers the peak memory footprint.
all_weight_params = sorted(all_weight_params, key=lambda wp: wp.num_weights, reverse=True)

# Compress model using weight compression parameters
transformed_model = self._backend_entity.transform_model(
model,
graph,
track(all_weight_params, description="Applying Weight Compression"),
track(all_weight_params, description=description),
scales,
zero_points,
lora_correction_algo,
)

self._backend_entity.dump_parameters(
@@ -427,6 +439,7 @@
"awq": self._awq,
"scale_estimation": self._scale_estimation,
"gptq": self._gptq,
"lora_correction": self._lora_correction,
},
algo_name="weight_compression",
)
16 changes: 2 additions & 14 deletions nncf/quantization/algorithms/weight_compression/awq.py
@@ -25,6 +25,7 @@
from nncf.common.utils.backend import BackendType
from nncf.common.utils.backend import get_backend
from nncf.quantization.algorithms.algorithm import Algorithm
from nncf.quantization.algorithms.weight_compression.activation_stats import process_stats
from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_dequantization
from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization
@@ -101,9 +102,6 @@ def _set_backend_entity(self, model: TModel) -> None:
Creates a helper class with the backend-specific logic of the algorithm.
:param model: Backend-specific input model.
:param all_weight_params: List of all weight parameters.
:param nodes_to_compress: List of nodes for processing.
:param activations: The input activations of the layers considered for compression.
"""

model_backend = get_backend(model)
@@ -197,17 +195,7 @@

config = wp.compression_config

stats = self._activations[k]
X = fns.stack([fns.mean(stat, axis=0) for stat in stats])
X = fns.transpose(X)

s = fns.max(fns.abs(X), axis=1)

if X.shape[1] > self._subset_size:
lens = [stat.shape[0] for stat in stats]
step = X.shape[1] // self._subset_size
idxs = [i[0] for i in sorted(enumerate(lens), key=lambda x: -x[1])][::step]
X = X[:, idxs]
s, X = process_stats(self._activations[k], self._subset_size)

top_k = max(int(s.shape[0] * self._percent_to_apply), 1)
topk_idxs = fns.argsort(-s)[:top_k]
30 changes: 30 additions & 0 deletions nncf/quantization/algorithms/weight_compression/backend.py
@@ -127,6 +127,36 @@ def transform_model(
:return: The transformed model.
"""

@abstractmethod
def insert_adapters(self, wc_params: WeightCompressionParameters, lora_A: Tensor, lora_B: Tensor, int8_lora: bool):
"""
Expands a model's execution graph following the Low-Rank Adaptation (LoRA) concept.
It inserts two additional Linear layers with weight matrices of low rank that are executed in parallel to the
target Linear layer.
Before insertion:

    ----INPUT
         \
          orig.MM---------------------------------OUTPUT

After insertion:

    ----INPUT ----lora_A.MM----lora_B.MM----\
         \                                   add----OUTPUT
          orig.MM---------------------------/
:param wc_params: Parameters for weight compression.
:type wc_params: WeightCompressionParameters
:param lora_A: weights for the first LoRA matrix.
:type lora_A: Tensor
:param lora_B: weights for the second LoRA matrix.
:type lora_B: Tensor
:param int8_lora: indicates whether the LoRA matrices should be compressed to 8-bit.
:type int8_lora: bool
"""

@staticmethod
@abstractmethod
def target_point(target_type: TargetType, target_node_name: str, port_id: int) -> TargetPoint:
