Data-free mixed precision algorithm for Torch and Torch FX backends (#3042)

### Changes

Data-free mixed precision algorithm for Torch and Torch FX backends

### Reason for changes

Enables more accurate weight compression on the Torch and Torch FX backends: mixed precision (`ratio != 1`) can now be selected data-free, ranking weights by their quantization error instead of requiring activation statistics.
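
A minimal sketch of the new usage (the toy model, shapes, and example input are placeholders; the compression parameters mirror the updated tinyllama test case in `model_scope.py` below):

```python
import torch

import nncf
from nncf import CompressWeightsMode, SensitivityMetric
from nncf.torch import wrap_model

# Placeholder model; any torch.nn.Module with sufficiently large Linear layers works.
model = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.Linear(256, 256))
# The Torch backend still needs an example input to trace the model graph.
model = wrap_model(model, example_input=torch.ones(1, 256), trace_parameters=True)

# Data-free mixed precision: 80% of the weights go to INT4, and the most
# sensitive 20% -- ranked by weight quantization error alone -- stay INT8.
compressed_model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_SYM,
    ratio=0.8,
    group_size=64,
    sensitivity_metric=SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,
)
```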

### Related tickets

ref: 153918

### Tests

NNCF/job/manual/job/post_training_weight_compression/228
alexsu52 authored Oct 30, 2024
1 parent db3a935 commit 51a7fb6
Showing 7 changed files with 78 additions and 38 deletions.
55 changes: 43 additions & 12 deletions nncf/quantization/algorithms/weight_compression/mixed_precision.py
@@ -102,20 +102,13 @@ def apply(
             weight_param.compression_config = self._primary_config
             num_weights_in_4bit += weight_param.num_weights
 
-    @property
-    def available_backends(self) -> List[BackendType]:
-        return [BackendType.OPENVINO]
-
+    @abstractmethod
     def _set_backend_entity(self, model: TModel) -> None:
-        model_backend = get_backend(model)
-        if model_backend == BackendType.OPENVINO:
-            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVMixedPrecisionAlgoBackend
-
-            self._backend_entity = OVMixedPrecisionAlgoBackend(model)
-        else:
-            raise nncf.UnsupportedBackendError(
-                "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value)
-            )
+        """
+        Creates a helper class with the backend-specific logic of the algorithm.
+
+        :param model: Backend-specific input model.
+        """
 
     @abstractmethod
     def get_statistic_points(
@@ -142,6 +135,29 @@ class DataFreeCriterion(MixedPrecisionCriterion):
     A baseline mixed precision criterion that is based on quantization noise of weights only.
     """
 
+    @property
+    def available_backends(self) -> List[BackendType]:
+        return [BackendType.OPENVINO, BackendType.TORCH, BackendType.TORCH_FX]
+
+    def _set_backend_entity(self, model: TModel) -> None:
+        model_backend = get_backend(model)
+        if model_backend == BackendType.OPENVINO:
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVWeightCompressionAlgoBackend
+
+            self._backend_entity = OVWeightCompressionAlgoBackend(model)
+        elif model_backend == BackendType.TORCH:
+            from nncf.quantization.algorithms.weight_compression.torch_backend import PTWeightCompressionAlgoBackend
+
+            self._backend_entity = PTWeightCompressionAlgoBackend()
+        elif model_backend == BackendType.TORCH_FX:
+            from nncf.quantization.algorithms.weight_compression.torch_fx_backend import FXWeightCompressionAlgoBackend
+
+            self._backend_entity = FXWeightCompressionAlgoBackend()
+        else:
+            raise nncf.UnsupportedBackendError(
+                "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value)
+            )
+
     def _calc_weight_sensitivity(
         self,
         weight_param: WeightCompressionParameters,
@@ -197,6 +213,21 @@ class DataBasedCriterion(DataFreeCriterion):
 
     STAT_KEY = None
 
+    @property
+    def available_backends(self) -> List[BackendType]:
+        return [BackendType.OPENVINO]
+
+    def _set_backend_entity(self, model: TModel) -> None:
+        model_backend = get_backend(model)
+        if model_backend == BackendType.OPENVINO:
+            from nncf.quantization.algorithms.weight_compression.openvino_backend import OVMixedPrecisionAlgoBackend
+
+            self._backend_entity = OVMixedPrecisionAlgoBackend(model)
+        else:
+            raise nncf.UnsupportedBackendError(
+                "Cannot return backend-specific entity because {} is not supported!".format(model_backend.value)
+            )
+
     def _calc_activation_sensitivity(
         self,
         weight_param: WeightCompressionParameters,
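
`DataFreeCriterion` is now the backend-agnostic entry point: it scores each weight purely by the noise its 4-bit quantization would introduce, so no activation statistics are needed. A rough sketch of the idea, not the exact `_calc_weight_sensitivity` implementation (which quantizes per group and normalizes the error):

```python
import numpy as np

def weight_quantization_error(weight: np.ndarray, num_bits: int = 4) -> float:
    """Data-free sensitivity proxy: reconstruction error of symmetric
    per-output-channel integer quantization (illustrative only)."""
    level_high = 2 ** (num_bits - 1) - 1
    scale = np.abs(weight).max(axis=-1, keepdims=True) / level_high
    scale = np.where(scale == 0.0, 1.0, scale)  # guard against all-zero rows
    dequantized = np.clip(np.round(weight / scale), -level_high - 1, level_high) * scale
    return float(np.mean(np.abs(weight - dequantized)))

# Layers would be sorted by this score; the least sensitive fraction
# (controlled by `ratio`) is assigned the 4-bit primary precision.
```

`DataBasedCriterion` keeps the OpenVINO-only `available_backends` and the `OVMixedPrecisionAlgoBackend` dispatch, since its activation-based statistics are still collected only for OpenVINO models.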
16 changes: 10 additions & 6 deletions nncf/quantization/quantize_model.py
@@ -511,7 +511,6 @@ def compress_weights(
         )
 
         options = {
-            "sensitivity_metric": sensitivity_metric,
             "awq": awq,
             "scale_estimation": scale_estimation,
             "gptq": gptq,
@@ -523,8 +522,11 @@ def compress_weights(
                 f"Torch backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
             )
 
-        if ratio is not None and ratio != 1:
-            raise nncf.ParameterNotSupportedError("Torch backend does not support ratio != 1.")
+        if sensitivity_metric not in [None, SensitivityMetric.WEIGHT_QUANTIZATION_ERROR]:
+            raise nncf.ParameterNotSupportedError(
+                "Torch backend only supports data-free sensitivity metric. "
+                "Set None or SensitivityMetric.WEIGHT_QUANTIZATION_ERROR."
+            )
 
         if is_wrapped_model(model):
             if not model.nncf.trace_parameters:
@@ -553,7 +555,6 @@ def compress_weights(
         )
 
         options = {
-            "sensitivity_metric": sensitivity_metric,
             "awq": awq,
             "scale_estimation": scale_estimation,
             "gptq": gptq,
@@ -565,8 +566,11 @@ def compress_weights(
                 f"TorchFX backend does not support {', '.join(unsupported_options)} option(s). Set them to None."
             )
 
-        if ratio is not None and ratio != 1:
-            raise nncf.ParameterNotSupportedError("TorchFX backend does not support ratio != 1.")
+        if sensitivity_metric not in [None, SensitivityMetric.WEIGHT_QUANTIZATION_ERROR]:
+            raise nncf.ParameterNotSupportedError(
+                "TorchFX backend only supports data-free sensitivity metric. "
+                "Set None or SensitivityMetric.WEIGHT_QUANTIZATION_ERROR."
+            )
 
         if dataset:
             raise nncf.ParameterNotSupportedError(
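
Net effect of the two validation changes: `ratio != 1` is no longer rejected on Torch/TorchFX, and `sensitivity_metric` moved out of the blanket-unsupported options into a targeted check. A hypothetical session, reusing the placeholder `model` from the description above:

```python
import nncf
from nncf import CompressWeightsMode, SensitivityMetric

# Accepted now (previously raised: "Torch backend does not support ratio != 1."):
nncf.compress_weights(model, mode=CompressWeightsMode.INT4_SYM, ratio=0.5)

# Still rejected: data-based metrics need activation statistics.
nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_SYM,
    ratio=0.5,
    sensitivity_metric=SensitivityMetric.MEAN_ACTIVATION_VARIANCE,
)
# nncf.ParameterNotSupportedError: Torch backend only supports data-free
# sensitivity metric. Set None or SensitivityMetric.WEIGHT_QUANTIZATION_ERROR.
```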
6 changes: 3 additions & 3 deletions tests/post_training/data/wc_reference_data_2024.5.yaml
@@ -8,6 +8,6 @@ tinyllama_NF4_scale_estimation_stateful_per_channel_backend_OV:
   num_int4: 11
   num_int8: 290
 tinyllama_int4_data_free_backend_TORCH:
-  metric_value: 0.73541
-  num_int4: 308
-  num_int8: 4
+  metric_value: 0.73873
+  num_int4: 114
+  num_int8: 84
4 changes: 3 additions & 1 deletion tests/post_training/model_scope.py
@@ -446,8 +446,10 @@
         "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b",
         "pipeline_cls": LMWeightCompression,
         "compression_params": {
-            "mode": CompressWeightsMode.INT4_ASYM,
             "group_size": 64,
+            "ratio": 0.8,
+            "mode": CompressWeightsMode.INT4_SYM,
+            "sensitivity_metric": SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,
         },
         "backends": [BackendType.TORCH],
     },
28 changes: 16 additions & 12 deletions tests/post_training/test_quantize_conformance.py
@@ -106,7 +106,10 @@ def ref_data_correction(data: Dict, file_name: str):
         with file_path.open() as f:
             correction_data = yaml.safe_load(f)
         for m_name, c_data in correction_data.items():
-            data[m_name].update(c_data)
+            if m_name in data:
+                data[m_name].update(c_data)
+            else:
+                data[m_name] = c_data
         print(f"Applied correction file {file_path}")
 
     return data
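
`ref_data_correction` previously assumed every entry in a correction file already existed in the base reference data, so corrections could only tweak known cases and a new case raised `KeyError`. With the guard, a release-specific correction file can also add cases absent from the base file. The new merge semantics on hypothetical data:

```python
data = {"model_a_backend_OV": {"metric_value": 0.50, "atol": 1e-5}}
correction = {
    "model_a_backend_OV": {"metric_value": 0.51},     # updates the existing entry in place
    "model_b_backend_TORCH": {"metric_value": 0.70},  # formerly KeyError; now inserted whole
}
for m_name, c_data in correction.items():
    if m_name in data:
        data[m_name].update(c_data)
    else:
        data[m_name] = c_data
```

The fixture below now applies this correction before synthesizing the `_backend_FP32` baselines, so corrected or newly added cases get FP32 baseline entries too.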
@@ -125,17 +128,18 @@ def fixture_wc_reference_data():
     path_reference = DATA_ROOT / "wc_reference_data.yaml"
     with path_reference.open() as f:
         data = yaml.safe_load(f)
-    fp32_test_cases = defaultdict(dict)
-    for test_case_name in data:
-        if "atol" not in data[test_case_name]:
-            data[test_case_name]["atol"] = 1e-5
-        reported_name = test_case_name.split("_backend_")[0]
-        fp32_case_name = f"{reported_name}_backend_FP32"
-        fp32_test_cases[fp32_case_name]["metric_value"] = 1
-        if "atol" not in fp32_test_cases[fp32_case_name]:
-            fp32_test_cases[fp32_case_name]["atol"] = 1e-10
-    data.update(fp32_test_cases)
-    return ref_data_correction(data, "wc_reference_data")
+    data = ref_data_correction(data, "wc_reference_data")
+    fp32_test_cases = defaultdict(dict)
+    for test_case_name in data:
+        if "atol" not in data[test_case_name]:
+            data[test_case_name]["atol"] = 1e-5
+        reported_name = test_case_name.split("_backend_")[0]
+        fp32_case_name = f"{reported_name}_backend_FP32"
+        fp32_test_cases[fp32_case_name]["metric_value"] = 1
+        if "atol" not in fp32_test_cases[fp32_case_name]:
+            fp32_test_cases[fp32_case_name]["atol"] = 1e-10
+    data.update(fp32_test_cases)
+    return data
 
 
 @pytest.fixture(scope="session", name="ptq_result_data")
4 changes: 2 additions & 2 deletions tests/torch/fx/test_compress_weights.py
@@ -24,6 +24,7 @@
 from nncf.quantization import compress_weights
 from nncf.torch.dynamic_graph.patch_pytorch import disable_patching
 from tests.torch.ptq.test_weights_compression import ALL_SENSITIVITY_METRICS
+from tests.torch.ptq.test_weights_compression import DATA_BASED_SENSITIVITY_METRICS
 from tests.torch.ptq.test_weights_compression import INT4_MODES
 from tests.torch.ptq.test_weights_compression import INT8_MODES
 from tests.torch.ptq.test_weights_compression import SUPPORTED_MODES
@@ -240,8 +241,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params):
 @pytest.mark.parametrize(
     "params",
     (
-        {"ratio": 0.5},
-        *({"sensitivity_metric": metric} for metric in ALL_SENSITIVITY_METRICS),
+        *({"sensitivity_metric": metric} for metric in DATA_BASED_SENSITIVITY_METRICS),
         {"gptq": True},
         {"awq": True},
         {"scale_estimation": True},
3 changes: 1 addition & 2 deletions tests/torch/ptq/test_weights_compression.py
@@ -250,8 +250,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params):
 @pytest.mark.parametrize(
     "params",
     (
-        {"ratio": 0.5},
-        *({"sensitivity_metric": metric} for metric in ALL_SENSITIVITY_METRICS),
+        *({"sensitivity_metric": metric} for metric in DATA_BASED_SENSITIVITY_METRICS),
         {"gptq": True},
         {"awq": True},
         {"scale_estimation": True},
