[NNCF]: Add INT8 weight compression conformance test for Tinyllama-1.1b PyTorch model (#2636)

### Changes

- Added the `INT8` compression **test suite** to the `model_scope`
- Added `TORCH` backend support to the `LMWeightCompression` class
- For `INT8` compression, the _dataset_, as well as some other parameters
(see
[model_scope](https://github.com/openvinotoolkit/nncf/blob/f0081037f28af2a829043d4ddaf4902d91864724/tests/post_training/model_scope.py#L329C1-L340C7)),
are set to `None` (the `INT8` path is data-free; see the sketch after this list)
- [metric_value](https://github.com/openvinotoolkit/nncf/blob/f0081037f28af2a829043d4ddaf4902d91864724/tests/post_training/data/wc_reference_data.yaml#L17C1-L20C15) has been set to **0.95944**
- Mainly used `save_pretrained()` for `TORCH` models
- Omitted a few method calls that are not supported for `TORCH` models
(Check the commits for details)
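
A minimal sketch (illustrative, not the test suite's code) of the data-free path the new test case exercises, assuming the public `nncf.compress_weights` API. `INT8_ASYM` weight compression requires no calibration data, which is why `dataset` and the related parameters in `model_scope` can stay `None`:

```python
import nncf
import torch
from transformers import AutoModelForCausalLM

# Load the FP32 PyTorch model on CPU, as the TORCH backend of LMWeightCompression does.
model = AutoModelForCausalLM.from_pretrained(
    "tinyllama/tinyllama-1.1b-step-50k-105b", torch_dtype=torch.float32, device_map="cpu"
)

# Data-free INT8 weight compression: no calibration nncf.Dataset is passed.
compressed_model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_ASYM)
```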

 

### Reason for changes

Benchmarking of the changes via `whowhatbench` was requested in issue #2527.

### Related tickets

ref: 130788
Closes #2527

### Tests
- Added an `INT8` _weight compression_ **conformance** test for the `Tinyllama-1.1b` **PyTorch** model
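
For reference, a hypothetical sketch of what the conformance check amounts to against the new `wc_reference_data.yaml` entry (the reference numbers come from the diff below; the tolerance and function are illustrative, not the suite's actual code):

```python
# Reference values from the wc_reference_data.yaml entry added in this commit.
REFERENCE = {"metric_value": 0.95624, "num_int4": 0, "num_int8": 312}
ATOL = 0.001  # assumed tolerance for the whowhatbench similarity metric


def check_run(measured_similarity: float, num_int4: int, num_int8: int) -> None:
    # The similarity score must stay close to the reference value, and the
    # counts of INT4/INT8 weight tensors must match exactly.
    assert abs(measured_similarity - REFERENCE["metric_value"]) <= ATOL
    assert num_int4 == REFERENCE["num_int4"]
    assert num_int8 == REFERENCE["num_int8"]
```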

---------

Co-authored-by: Aleksander <[email protected]>
Co-authored-by: Alexander Suslov <[email protected]>
3 people authored May 2, 2024
1 parent 08d5f0c commit ba7e1a4
Showing 3 changed files with 86 additions and 38 deletions.
4 changes: 4 additions & 0 deletions tests/post_training/data/wc_reference_data.yaml
@@ -18,3 +18,7 @@ tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV:
  metric_value: 0.83795
  num_int4: 188
  num_int8: 124
tinyllama_int8_data_free_backend_TORCH:
  metric_value: 0.95624
  num_int4: 0
  num_int8: 312
9 changes: 9 additions & 0 deletions tests/post_training/model_scope.py
@@ -370,6 +370,15 @@
        "params": {"is_stateful": True},
        "backends": [BackendType.OV],
    },
    {
        "reported_name": "tinyllama_int8_data_free",
        "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b",
        "pipeline_cls": LMWeightCompression,
        "compression_params": {
            "mode": CompressWeightsMode.INT8_ASYM,
        },
        "backends": [BackendType.TORCH],
    },
]


111 changes: 73 additions & 38 deletions tests/post_training/pipelines/lm_weight_compression.py
@@ -18,9 +18,12 @@

import numpy as np
import openvino as ov
import torch
from datasets import load_dataset
from memory_profiler import memory_usage
from optimum.exporters.openvino.convert import export_from_model
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from whowhatbench import Evaluator

@@ -72,20 +75,36 @@ class LMWeightCompression(BaseTestPipeline):

def prepare_model(self) -> None:
is_stateful = self.params.get("is_stateful", False)
if is_stateful:
self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf")
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
# export by model_id
self.model_hf = OVModelForCausalLM.from_pretrained(
self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful

# load model
if self.backend == BackendType.TORCH:
if is_stateful:
raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.")

self.model_hf = AutoModelForCausalLM.from_pretrained(
self.model_id, torch_dtype=torch.float32, device_map="cpu"
)
self._dump_model_fp32()
self.model = self.model_hf
elif self.backend == BackendType.OV:
if is_stateful:
self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf")
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
# export by model_id
self.model_hf = OVModelForCausalLM.from_pretrained(
self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
else:
# no export, load from IR. Applicable for sequential run of test cases in local environment.
self.model_hf = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
self.model = self.model_hf.model
else:
# no export, load from IR. Applicable for sequential run of test cases in local environment.
self.model_hf = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
self.model = self.model_hf.model
raise RuntimeError(f"backend={self.backend.value} is not supported.")

# dump FP32 model
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
self._dump_model_fp32()

def prepare_preprocessor(self) -> None:
self.preprocessor = AutoTokenizer.from_pretrained(self.model_id)
@@ -108,36 +127,40 @@ def transform_fn(data, max_tokens=128):
inputs["attention_mask"] = attention_mask
position_ids = np.cumsum(attention_mask, axis=1) - 1
position_ids[attention_mask == 0] = 1

# The magic forms KV cache as model inputs
batch_size = input_ids.shape[0]
for input_name in self.model_hf.key_value_input_names:
model_inputs = self.model.input(input_name)
shape = model_inputs.get_partial_shape()
shape[0] = batch_size
if shape[2].is_dynamic:
shape[2] = 0
else:
shape[1] = 0
inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

inputs["position_ids"] = position_ids

# initialize the rest of inputs (e.g. beam_idx for stateful models)
for val in self.model.inputs:
name = val.any_name
if name in inputs:
continue
shape = list(val.partial_shape.get_min_shape())
shape[0] = batch_size
inputs[name] = np.zeros(shape)
if self.backend == BackendType.OV:
# The magic forms KV cache as model inputs
batch_size = input_ids.shape[0]
for input_name in self.model_hf.key_value_input_names:
model_inputs = self.model.input(input_name)
shape = model_inputs.get_partial_shape()
shape[0] = batch_size
if shape[2].is_dynamic:
shape[2] = 0
else:
shape[1] = 0
inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

# initialize the rest of inputs (e.g. beam_idx for stateful models)
for val in self.model.inputs:
name = val.any_name
if name in inputs:
continue
shape = list(val.partial_shape.get_min_shape())
shape[0] = batch_size
inputs[name] = np.zeros(shape)
if self.backend == BackendType.TORCH:
for input_name in inputs:
inputs[input_name] = torch.from_numpy(inputs[input_name])
return inputs

return transform_fn

def prepare_calibration_dataset(self):
dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e")
dataset = dataset.filter(lambda example: len(example["text"]) > 128)

self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn())

def cleanup_cache(self):
@@ -164,8 +187,12 @@ def collect_data_from_stdout(self, stdout: str):
def save_compressed_model(self) -> None:
if self.backend == BackendType.FP32:
return
ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)

if self.backend == BackendType.OV:
ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)
elif self.backend == BackendType.TORCH:
export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32")

def get_num_compressed(self) -> None:
"""
@@ -174,7 +201,12 @@ def get_num_compressed(self) -> None:
num_int8 = 0
num_int4 = 0

for node in self.model.get_ops():
if self.backend == BackendType.TORCH:
model = ov.Core().read_model(self.output_model_dir / self.OV_MODEL_NAME)
else:
model = self.model

for node in model.get_ops():
for i in range(node.get_output_size()):
if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]:
num_int8 += 1
@@ -192,8 +224,11 @@ def _dump_model_fp32(self) -> None:
Dump IRs of fp32 models, to help debugging. The test cases may share the same fp32 model, therefore it is saved
to the dedicated shared folder.
"""
self.model_hf.save_pretrained(self.fp32_model_dir)
self.model_hf._save_config(self.fp32_model_dir)
if self.backend == BackendType.OV:
self.model_hf.save_pretrained(self.fp32_model_dir)
self.model_hf._save_config(self.fp32_model_dir)
elif self.backend == BackendType.TORCH:
export_from_model(self.model_hf, self.fp32_model_dir, stateful=False, compression_option="fp32")

def _compress(self):
"""
