QuantizationPreset.MIXED for ModelType.TRANSFORMER by default (#2216)

### Changes Made the QuantizationPreset.MIXED preset for ModelType.TRANSFORMER by default. ### Reason for changes Quantization of transformer-based models with QuantizationPreset.MIXED preset shows the best accuracy w/o performance degradation. ### Related tickets ref: 123235 ### Tests test_create_nncf_config test_quantization_preset
openvinotoolkit · Oct 25, 2023 · d264494 · d264494
1 parent 9cc57c0
commit d264494
Show file tree

Hide file tree

Showing 13 changed files with 165 additions and 60 deletions.
diff --git a/nncf/experimental/torch/quantization/quantize_model.py b/nncf/experimental/torch/quantization/quantize_model.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
 
@@ -87,7 +87,7 @@ def send_to_device(tensor):
 def quantize_impl(
     model: torch.nn.Module,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset,
+    preset: Union[QuantizationPreset, None],
     target_device: TargetDevice,
     subset_size: int,
     fast_bias_correction: bool,

diff --git a/nncf/onnx/quantization/quantize_model.py b/nncf/onnx/quantization/quantize_model.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional
+from typing import Optional, Union
 
 import onnx
 
@@ -31,7 +31,7 @@
 def quantize_impl(
     model: onnx.ModelProto,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset,
+    preset: Union[QuantizationPreset, None],
     target_device: TargetDevice,
     subset_size: int,
     fast_bias_correction: bool,

diff --git a/nncf/openvino/pot/quantization/quantize_model.py b/nncf/openvino/pot/quantization/quantize_model.py
@@ -12,7 +12,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, Optional
+from typing import Any, Callable, Dict, Iterable, Optional, Union
 
 import openvino.runtime as ov
 from openvino._offline_transformations import compress_quantize_weights_transformation
@@ -192,22 +192,22 @@ def _create_quantization_group_config(
 
 
 def _create_quantization_config(
-    preset: QuantizationPreset,
+    preset: Union[QuantizationPreset, None],
     target_device: TargetDevice,
     subset_size: int,
     fast_bias_correction: bool,
-    model_type: Optional[ModelType],
-    ignored_scope: Optional[IgnoredScope],
-    advanced_parameters: Optional[AdvancedQuantizationParameters],
+    model_type: Union[ModelType, None],
+    ignored_scope: Union[IgnoredScope, None],
+    advanced_parameters: Union[AdvancedQuantizationParameters, None],
 ) -> Dict[str, Any]:
     """
     Creates a quantization configuration.
 
-    :param preset: A preset that controls the quantization mode
-        (symmetric and asymmetric). It can take the following values:
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
         - `performance`: Symmetric quantization of weights and activations.
-        - `mixed`: Symmetric quantization of weights and asymmetric
-          quantization of activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        - `None`: `mixed` preset is used for `transformer` model type otherwise `performace`.
     :param target_device: A target device the specificity of which will be
         taken into account while compressing in order to obtain the best
         performance for this type of device.
@@ -224,6 +224,9 @@ def _create_quantization_config(
         fine-tuning the quantization algorithm.
     :return: A POT quantization configuration as dict.
     """
+    if preset is None:
+        preset = QuantizationPreset.MIXED if model_type == ModelType.TRANSFORMER else QuantizationPreset.PERFORMANCE
+
     config = {
         "target_device": target_device.value,
         "preset": preset.value,
@@ -320,7 +323,7 @@ def _create_engine_config(
 def quantize_impl(
     model: ov.Model,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -423,7 +426,7 @@ def quantize_with_accuracy_control_impl(
     validation_fn: Callable[[ov.CompiledModel, Iterable[Any]], float],
     max_drop: float = 0.01,
     drop_type: DropType = DropType.ABSOLUTE,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,

diff --git a/nncf/openvino/quantization/quantize_model.py b/nncf/openvino/quantization/quantize_model.py
@@ -96,7 +96,7 @@ def dump_parameters(model: ov.Model, parameters: Dict, path: Optional[List] = No
 def native_quantize_if_op_impl(
     model: ov.Model,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -138,7 +138,7 @@ def native_quantize_if_op_impl(
     dump_parameters(
         quantized_model,
         {
-            "preset": preset.value,
+            "preset": preset,
             "target_device": target_device.value,
             "subset_size": subset_size,
             "fast_bias_correction": fast_bias_correction,
@@ -154,7 +154,7 @@ def native_quantize_if_op_impl(
 def native_quantize_impl(
     model: ov.Model,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -184,7 +184,7 @@ def native_quantize_impl(
     dump_parameters(
         quantized_model,
         {
-            "preset": preset.value,
+            "preset": preset,
             "target_device": target_device.value,
             "subset_size": subset_size,
             "fast_bias_correction": fast_bias_correction,
@@ -206,7 +206,7 @@ def native_quantize_with_accuracy_control_impl(
     validation_fn: Callable[[Any, Iterable[Any]], Tuple[float, Union[None, List[float], List[List[TTensor]]]]],
     max_drop: float = 0.01,
     drop_type: DropType = DropType.ABSOLUTE,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -321,7 +321,7 @@ def native_quantize_with_accuracy_control_impl(
     dump_parameters(
         quantized_model,
         {
-            "preset": preset.value,
+            "preset": preset,
             "target_device": target_device.value,
             "subset_size": subset_size,
             "fast_bias_correction": fast_bias_correction,
@@ -339,7 +339,7 @@ def native_quantize_with_accuracy_control_impl(
 def quantize_impl(
     model: ov.Model,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -396,7 +396,7 @@ def quantize_with_accuracy_control_impl(
     validation_fn: Callable[[Any, Iterable[Any]], float],
     max_drop: float = 0.01,
     drop_type: DropType = DropType.ABSOLUTE,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,

diff --git a/nncf/quantization/algorithms/min_max/algorithm.py b/nncf/quantization/algorithms/min_max/algorithm.py
@@ -96,7 +96,7 @@ class MinMaxQuantization(Algorithm):
 
     def __init__(
         self,
-        preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+        preset: Optional[QuantizationPreset] = None,
         target_device: TargetDevice = TargetDevice.ANY,
         subset_size: int = 300,
         model_type: Optional[ModelType] = None,
@@ -111,8 +111,12 @@ def __init__(
         backend_params: Optional[Dict[str, Any]] = None,
     ):
         """
-        :param preset: A preset that controls the quantization mode,
-            defaults to QuantizationPreset.PERFORMANCE.
+        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+            It can take the following values:
+            - `performance`: Symmetric quantization of weights and activations.
+            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+            Default value is None. In this case, `mixed` preset is used for `transformer`
+            model type otherwise `performace`.
         :param target_device: A target device the specificity of which will be taken
             into account while compressing in order to obtain the best performance
             for this type of device, defaults to TargetDevice.ANY.
@@ -157,6 +161,13 @@ def __init__(
             QuantizerGroup.ACTIVATIONS: activations_range_estimator_params,
         }
 
+        # preset definition
+        if preset is None:
+            if model_type == ModelType.TRANSFORMER:
+                preset = QuantizationPreset.MIXED
+            else:
+                preset = QuantizationPreset.PERFORMANCE
+
         # Calculates global quantizer constraints
         self._global_quantizer_constraints = {}
         for quantizer_group in QuantizerGroup:

diff --git a/nncf/quantization/algorithms/post_training/algorithm.py b/nncf/quantization/algorithms/post_training/algorithm.py
@@ -38,7 +38,7 @@ class PostTrainingQuantization(Algorithm):
 
     def __init__(
         self,
-        preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+        preset: Optional[QuantizationPreset] = None,
         target_device: TargetDevice = TargetDevice.ANY,
         subset_size: int = 300,
         fast_bias_correction: bool = True,
@@ -47,11 +47,12 @@ def __init__(
         advanced_parameters: Optional[AdvancedQuantizationParameters] = None,
     ):
         """
-        :param preset: A preset that controls the quantization mode
-            (symmetric and asymmetric). It can take the following values:
+        :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+            It can take the following values:
             - `performance`: Symmetric quantization of weights and activations.
-            - `mixed`: Symmetric quantization of weights and asymmetric
-            quantization of activations.
+            - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+            Default value is None. In this case, `mixed` preset is used for `transformer`
+            model type otherwise `performace`.
         :param target_device: A target device the specificity of which will be taken
             into account while compressing in order to obtain the best performance
             for this type of device.

diff --git a/nncf/quantization/algorithms/post_training/pipeline.py b/nncf/quantization/algorithms/post_training/pipeline.py
@@ -30,7 +30,7 @@
 
 
 def create_ptq_pipeline(
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -47,11 +47,12 @@ def create_ptq_pipeline(
         3) MinMaxQuantization
         4) FastBiasCorrection or BiasCorrection
 
-    :param preset: A preset that controls the quantization mode
-        (symmetric and asymmetric). It can take the following values:
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
         - `performance`: Symmetric quantization of weights and activations.
-        - `mixed`: Symmetric quantization of weights and asymmetric
-        quantization of activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, `mixed` preset is used for `transformer`
+        model type otherwise `performace`.
     :param target_device: A target device the specificity of which will be taken
         into account while compressing in order to obtain the best performance
         for this type of device.

diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py
@@ -38,7 +38,7 @@
 def quantize(
     model: TModel,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -54,11 +54,12 @@ def quantize(
     :param calibration_dataset: A representative dataset for the
         calibration process.
     :type  calibration_dataset: nncf.Dataset
-    :param preset: A preset that controls the quantization mode
-        (symmetric and asymmetric). It can take the following values:
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
         - `performance`: Symmetric quantization of weights and activations.
-        - `mixed`: Symmetric quantization of weights and asymmetric
-          quantization of activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, `mixed` preset is used for `transformer`
+        model type otherwise `performace`.
     :type  preset: nncf.QuantizationPreset
     :param target_device: A target device the specificity of which will be taken
         into account while compressing in order to obtain the best performance
@@ -152,7 +153,7 @@ def quantize_with_accuracy_control(
     validation_fn: Callable[[Any, Iterable[Any]], float],
     max_drop: float = 0.01,
     drop_type: DropType = DropType.ABSOLUTE,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -179,7 +180,12 @@ def quantize_with_accuracy_control(
     :param max_drop: The maximum accuracy drop that should be achieved after the quantization.
     :param drop_type: The accuracy drop type, which determines how the maximum accuracy
         drop between the original model and the compressed model is calculated.
-    :param preset: A preset that controls the quantization mode.
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, `mixed` preset is used for `transformer`
+        model type otherwise `performace`.
     :type preset: nncf.QuantizationPreset
     :param target_device: A target device the specificity of which will be taken
         into account while compressing in order to obtain the best performance
@@ -288,7 +294,7 @@ def quantize_with_tune_hyperparams(
     initial_metric_results: MetricResults,
     quantized_metric_results: MetricResults,
     tuner_subset_size: int = 300,
-    preset: QuantizationPreset = QuantizationPreset.PERFORMANCE,
+    preset: Optional[QuantizationPreset] = None,
     target_device: TargetDevice = TargetDevice.ANY,
     subset_size: int = 300,
     fast_bias_correction: bool = True,
@@ -306,7 +312,12 @@ def quantize_with_tune_hyperparams(
     :param initial_metric_results: Initial metric results.
     :param quantized_metric_results: Quantized metric results.
     :param tuner_subset_size: Tuner subset size.
-    :param preset: A preset that controls the quantization mode.
+    :param preset: A preset controls the quantization mode (symmetric and asymmetric).
+        It can take the following values:
+        - `performance`: Symmetric quantization of weights and activations.
+        - `mixed`: Symmetric quantization of weights and asymmetric quantization of activations.
+        Default value is None. In this case, `mixed` preset is used for `transformer`
+        model type otherwise `performace`.
     :param target_device: A target device the specificity of which will be taken
         into account while compressing in order to obtain the best performance
         for this type of device.

diff --git a/nncf/tensorflow/quantization/quantize_model.py b/nncf/tensorflow/quantization/quantize_model.py
@@ -9,7 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
 import tensorflow as tf
 
@@ -133,7 +133,7 @@ def _create_nncf_config(
 def quantize_impl(
     model: tf.Module,
     calibration_dataset: Dataset,
-    preset: QuantizationPreset,
+    preset: Union[QuantizationPreset, None],
     target_device: TargetDevice,
     subset_size: int,
     fast_bias_correction: bool,
@@ -157,6 +157,9 @@ def quantize_impl(
     if target_device == TargetDevice.CPU_SPR:
         raise RuntimeError("target_device == CPU_SPR is not supported.")
 
+    if preset is None:
+        preset = QuantizationPreset.PERFORMANCE
+
     nncf_config = _create_nncf_config(preset, target_device, subset_size, ignored_scope, advanced_parameters)
 
     calibration_data_loader = CalibrationDataLoader(calibration_dataset)