From 2d6fbe578f077a86281b82ebf71ce4a87745890e Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 9 Apr 2024 14:47:41 +0400 Subject: [PATCH 01/34] feat: Added to the test scope TODO: Make the test --- tests/post_training/model_scope.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index e6c46f715d0..ed6a732eeae 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -326,6 +326,15 @@ "backends": [BackendType.OV], "is_batch_size_supported": False, }, + { + "reported_name": "tinyllama_int8_data_free", + "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", + "pipeline_cls": LMWeightCompression, + "compression_params": { + "mode": CompressWeightsMode.INT8_ASYM, + }, + "backends": [BackendType.TORCH], + }, ] From 52e81800e97c2bbce53996ffd71d591d9990ad10 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Fri, 12 Apr 2024 15:43:10 +0400 Subject: [PATCH 02/34] feat: Added torch backend support compress() and _compress_torch() methods were implemented --- .../pipelines/lm_weight_compression.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b1a6e5853dc..36cdad457f0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,6 +18,8 @@ import numpy as np import openvino as ov +import torch +import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -143,6 +145,22 @@ def cleanup_cache(self): def compress(self) -> None: if self.backend == BackendType.FP32: return + elif self.backend == BackendType.TORCH: + start_time = time.perf_counter() + + tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) + self.model = transformers.AutoModelForCausalLM.from_pretrained( + self.model_id, torch_dtype=torch.float16, device_map="cpu" + ) + + text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
+ token = tokenizer(text, max_length=500, return_tensors="pt", truncation=True) + inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} + + self.run_info.compression_memory_usage = memory_usage(self._compress_torch(inputs), max_usage=True) + self.run_info.time_compression = time.perf_counter() - start_time + + return print("Weight compression...") start_time = time.perf_counter() @@ -174,6 +192,9 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + def _compress_torch(self, inputs): + self.compressed_model = nncf.compress_weights(self.model, dataset=nncf.Dataset([inputs])) + def _compress(self): """ Actual call of weight compression From c02480338141fbeaf990ce9d149d2c252c472e93 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 18 Apr 2024 21:17:29 +0400 Subject: [PATCH 03/34] fix: Moved int8 conversion in _validate() --- .../pipelines/lm_weight_compression.py | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 36cdad457f0..80d8cf3a5f9 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -19,10 +19,10 @@ import numpy as np import openvino as ov import torch -import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM +from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -145,22 +145,6 @@ def cleanup_cache(self): def compress(self) -> None: if self.backend == BackendType.FP32: return - elif self.backend == BackendType.TORCH: - start_time = time.perf_counter() - - tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) - self.model = transformers.AutoModelForCausalLM.from_pretrained( - self.model_id, torch_dtype=torch.float16, device_map="cpu" - ) - - text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = tokenizer(text, max_length=500, return_tensors="pt", truncation=True) - inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} - - self.run_info.compression_memory_usage = memory_usage(self._compress_torch(inputs), max_usage=True) - self.run_info.time_compression = time.perf_counter() - start_time - - return print("Weight compression...") start_time = time.perf_counter() @@ -192,9 +176,6 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) - def _compress_torch(self, inputs): - self.compressed_model = nncf.compress_weights(self.model, dataset=nncf.Dataset([inputs])) - def _compress(self): """ Actual call of weight compression @@ -231,7 +212,11 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + if self.backend == BackendType.TORCH: + compressed_model_hf = AutoModelForCausalLM.from_pretrained( + self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" + ) + elif self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f48c148de50e0a0891a494483ee11371390a5872 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 21:41:28 +0400 Subject: [PATCH 04/34] fix: Returned initial implementation of _validate() --- tests/post_training/pipelines/lm_weight_compression.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index c3f5d89a3f4..169d0a5f771 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -71,6 +71,7 @@ class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" OV_MODEL_NAME = "openvino_model.xml" + TORCH_MODEL_NAME = "torch_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) @@ -226,11 +227,7 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend == BackendType.TORCH: - compressed_model_hf = AutoModelForCausalLM.from_pretrained( - self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" - ) - elif self.backend != BackendType.FP32: + if self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f9505e41c3a5fe13f12f4a2dceccbb64f180a1eb Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 22:52:34 +0400 Subject: [PATCH 05/34] chore: Temporary dummy data --- tests/post_training/data/wc_reference_data.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 5235d155244..1e97dc4c3d0 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -13,4 +13,8 @@ tinyllama_data_aware_awq_backend_OV: tinyllama_data_aware_awq_stateful_backend_OV: metric_value: 0.81237 num_int4: 184 - num_int8: 128 \ No newline at end of file + num_int8: 128 +tinyllama_int8_data_free_backend_TORCH: + metric_value: 0.72057 + num_int4: 228 + num_int8: 84 From 2bc73ec0b83ed0272ec0505d49d7b149bbb92115 Mon Sep 17 
00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 22:53:48 +0400 Subject: [PATCH 06/34] fix: Model Preparation for TORCH backend --- .../pipelines/lm_weight_compression.py | 50 ++++++------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 169d0a5f771..121000e36c8 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,9 +16,9 @@ from dataclasses import dataclass from typing import Dict, Optional +import torch import numpy as np import openvino as ov -import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -71,7 +71,6 @@ class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" OV_MODEL_NAME = "openvino_model.xml" - TORCH_MODEL_NAME = "torch_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) @@ -147,6 +146,11 @@ def compress(self) -> None: if self.backend == BackendType.FP32: return + if self.backend == BackendType.TORCH: + inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS") + if inference_num_threads is not None: + torch.set_num_threads(int(inference_num_threads)) + print("Weight compression...") start_time = time.perf_counter() self.run_info.compression_memory_usage = memory_usage(self._compress, max_usage=True) @@ -160,25 +164,15 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return + elif self.backend == BackendType.TORCH: + self.model = ov.convert_model( + self.compressed_model.cpu(), example_input=self.dummy_tensor.cpu(), input=self.input_size + ) ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) def get_num_compressed(self) -> None: - """ - Get number of the i8, u8, i4, u4 ops in the compressed IR. 
- """ - num_int8 = 0 - num_int4 = 0 - - for node in self.model.get_ops(): - for i in range(node.get_output_size()): - if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]: - num_int8 += 1 - if node.get_output_element_type(i).get_type_name() in ["i4", "u4"]: - num_int4 += 1 - - self.run_info.num_compress_nodes.num_int8 = num_int8 - self.run_info.num_compress_nodes.num_int4 = num_int4 + pass def run_bench(self) -> None: pass @@ -227,7 +221,11 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + if self.backend == BackendType.TORCH: + compressed_model_hf = AutoModelForCausalLM.from_pretrained( + self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" + ) + elif self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) @@ -236,19 +234,3 @@ def _validate(self): similarity = all_metrics["similarity"][0] self.run_info.metric_name = "Similarity" self.run_info.metric_value = round(similarity, 5) - - num_int4_reference = self.reference_data.get("num_int4") - num_int8_reference = self.reference_data.get("num_int8") - - num_int4_value = self.run_info.num_compress_nodes.num_int4 - num_int8_value = self.run_info.num_compress_nodes.num_int8 - - if num_int4_reference != num_int4_value: - status_msg = f"Regression: The number of int4 ops is different \ - than reference {num_int4_reference} != {num_int4_value}" - raise ValueError(status_msg) - - if num_int8_reference != num_int8_value: - status_msg = f"Regression: The number of int8 ops is different \ - than reference {num_int8_reference} != {num_int8_value}" - raise ValueError(status_msg) From 927c38fe9d933ba4b6bb38f22b9fb861ee72b580 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:23:24 +0400 Subject: [PATCH 07/34] fix: Removed unsupported parameters for INT8 TODO: Maybe make it in a way where I check for INT8 instead of BackendType.TORCH, --- tests/post_training/model_scope.py | 3 +++ .../pipelines/lm_weight_compression.py | 26 +++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ed6a732eeae..3920d0af3ea 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -332,6 +332,9 @@ "pipeline_cls": LMWeightCompression, "compression_params": { "mode": CompressWeightsMode.INT8_ASYM, + "all_layers": None, + "awq": None, + "sensitivity_metric": None, }, "backends": [BackendType.TORCH], }, diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 121000e36c8..ae99b150787 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,13 +16,11 @@ from dataclasses import dataclass from typing import Dict, Optional -import torch import numpy as np import openvino as ov from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -146,11 +144,6 @@ def compress(self) -> None: if self.backend == BackendType.FP32: return - if self.backend == BackendType.TORCH: - inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS") - if inference_num_threads is not None: - 
torch.set_num_threads(int(inference_num_threads)) - print("Weight compression...") start_time = time.perf_counter() self.run_info.compression_memory_usage = memory_usage(self._compress, max_usage=True) @@ -164,10 +157,6 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - elif self.backend == BackendType.TORCH: - self.model = ov.convert_model( - self.compressed_model.cpu(), example_input=self.dummy_tensor.cpu(), input=self.input_size - ) ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) @@ -189,6 +178,15 @@ def _compress(self): """ Actual call of weight compression """ + if self.backend == BackendType.TORCH: + self.compressed_model = nncf.compress_weights( + self.model, + dataset=None, + **self.compression_params, + ) + + return + self.compressed_model = nncf.compress_weights( self.model, dataset=self.calibration_dataset, @@ -221,11 +219,7 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend == BackendType.TORCH: - compressed_model_hf = AutoModelForCausalLM.from_pretrained( - self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" - ) - elif self.backend != BackendType.FP32: + if self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f0081037f28af2a829043d4ddaf4902d91864724 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:34:46 +0400 Subject: [PATCH 08/34] chore: Comment on important addition --- tests/post_training/pipelines/lm_weight_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index ae99b150787..cf1845266bd 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -179,6 +179,7 @@ def _compress(self): Actual call of weight compression """ if self.backend == BackendType.TORCH: + """If Backend is TORCH (Assuming that it's INT8 compression), don't use a dataset as it's Unsupported""" self.compressed_model = nncf.compress_weights( self.model, dataset=None, From eeade477464c016cd031be399f6a64db9b1cdb7c Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:46:08 +0400 Subject: [PATCH 09/34] feat: Added correct metric value according to @aleksu52 Co-authored-by: Aleksander --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 1e97dc4c3d0..760d6ed5fea 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -15,6 +15,6 @@ tinyllama_data_aware_awq_stateful_backend_OV: num_int4: 184 num_int8: 128 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.72057 + metric_value: 0.96283 num_int4: 228 num_int8: 84 From fc05eed783f417584d630b879aa3263b5d963e7d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 23 Apr 2024 11:35:02 +0400 Subject: [PATCH 10/34] fix: Mode accurate check for the INT8 compression mode --- .../pipelines/lm_weight_compression.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py 
b/tests/post_training/pipelines/lm_weight_compression.py index cf1845266bd..73d2dc39ef9 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -20,6 +20,7 @@ import openvino as ov from datasets import load_dataset from memory_profiler import memory_usage +from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -178,15 +179,9 @@ def _compress(self): """ Actual call of weight compression """ - if self.backend == BackendType.TORCH: - """If Backend is TORCH (Assuming that it's INT8 compression), don't use a dataset as it's Unsupported""" - self.compressed_model = nncf.compress_weights( - self.model, - dataset=None, - **self.compression_params, - ) - - return + if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: + """If compression mode is INT8, don't use a dataset as it's Unsupported""" + self.calibration_dataset = None self.compressed_model = nncf.compress_weights( self.model, From 4aefa0dbd92a8063de246d96b7984e4208d09312 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 23 Apr 2024 17:10:49 +0400 Subject: [PATCH 11/34] feat: Problematic code for @aleksu52 to reproduce --- .../pipelines/lm_weight_compression.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 73d2dc39ef9..18229b73aae 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -175,13 +175,31 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + # def _compress(self): + # """ + # Actual call of weight compression + # """ + # if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: + # """If compression mode is INT8, don't use a dataset as it's Unsupported""" + # self.calibration_dataset = None + # + # self.compressed_model = nncf.compress_weights( + # self.model, + # dataset=self.calibration_dataset, + # **self.compression_params, + # ) def _compress(self): """ Actual call of weight compression """ - if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: - """If compression mode is INT8, don't use a dataset as it's Unsupported""" - self.calibration_dataset = None + if self.backend == BackendType.TORCH: + from nncf.torch.model_creation import is_wrapped_model + from nncf.torch.model_creation import wrap_model + + if not is_wrapped_model(self.model): + example_input = next(iter(self.calibration_dataset.get_inference_data())) + self.model = wrap_model(self.model, example_input=example_input, trace_parameters=True) + self.calibration_dataset = None self.compressed_model = nncf.compress_weights( self.model, From 737c1a70bbd06c0f88ad3eb8aef293dc2bdcc866 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 24 Apr 2024 13:05:22 +0400 Subject: [PATCH 12/34] feat: Use AutoModelForCausalLM for TORCH models --- .../pipelines/lm_weight_compression.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 18229b73aae..a1b634481e7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ 
b/tests/post_training/pipelines/lm_weight_compression.py @@ -22,7 +22,7 @@ from memory_profiler import memory_usage from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM from whowhatbench import Evaluator import nncf @@ -69,21 +69,25 @@ def get_stats(self) -> Dict[str, str]: class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" - OV_MODEL_NAME = "openvino_model.xml" + MODEL_NAME = "openvino_model.xml" + MODEL_FUNC = OVModelForCausalLM def prepare_model(self) -> None: + if self.backend == BackendType.TORCH: + self.MODEL_NAME = "torch_model.xml" + self.MODEL_FUNC = AutoModelForCausalLM is_stateful = self.params.get("is_stateful", False) if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") - if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_hf = self.MODEL_FUNC.from_pretrained( self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful ) self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. - self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_hf = self.MODEL_FUNC.from_pretrained( self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) self.model = self.model_hf.model @@ -158,7 +162,7 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) self.model_hf._save_config(self.output_model_dir) def get_num_compressed(self) -> None: @@ -220,7 +224,7 @@ def _validate(self): gt_data_path.parent.mkdir(parents=True, exist_ok=True) if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") - model_gold = OVModelForCausalLM.from_pretrained( + model_gold = self.MODEL_FUNC.from_pretrained( self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) @@ -234,7 +238,7 @@ def _validate(self): compressed_model_hf = self.model_hf if self.backend != BackendType.FP32: - compressed_model_hf = OVModelForCausalLM.from_pretrained( + compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) print("Evaluation of the target model") From 8066b76daca190cf3349d51f18c95c6fa8267ee6 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 24 Apr 2024 13:12:57 +0400 Subject: [PATCH 13/34] fix: Added model specific parameters during preparation Some parameters for OVModelForCausalLM DO NOT apply for LlamaForCausalLM --- tests/post_training/pipelines/lm_weight_compression.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index a1b634481e7..96551d983c0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ 
b/tests/post_training/pipelines/lm_weight_compression.py @@ -73,22 +73,25 @@ class LMWeightCompression(BaseTestPipeline): MODEL_FUNC = OVModelForCausalLM def prepare_model(self) -> None: + is_stateful = self.params.get("is_stateful", False) if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - is_stateful = self.params.get("is_stateful", False) + MODEL_SPECIFIC_PARAMS = {} + else: + MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS ) self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS ) self.model = self.model_hf.model From 004199852d0d7145adec284210913a30490bfbca Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:03:51 +0400 Subject: [PATCH 14/34] refactor: Make a tokenizer during model preparation Needed to tokenize example input for torch model later --- .../pipelines/lm_weight_compression.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 96551d983c0..4b004301a41 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,11 +18,12 @@ import numpy as np import openvino as ov +import transformers from datasets import load_dataset from memory_profiler import memory_usage -from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import AutoModelForCausalLM +from transformers import AutoTokenizer from whowhatbench import Evaluator import nncf @@ -78,16 +79,16 @@ def prepare_model(self) -> None: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM MODEL_SPECIFIC_PARAMS = {} + self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) else: MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS - ) - self._dump_model_fp32() + self.model_hf = self.MODEL_FUNC.from_pretrained(self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS) + if self.backend != BackendType.TORCH: + self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. 
self.model_hf = self.MODEL_FUNC.from_pretrained( @@ -182,19 +183,6 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) - # def _compress(self): - # """ - # Actual call of weight compression - # """ - # if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: - # """If compression mode is INT8, don't use a dataset as it's Unsupported""" - # self.calibration_dataset = None - # - # self.compressed_model = nncf.compress_weights( - # self.model, - # dataset=self.calibration_dataset, - # **self.compression_params, - # ) def _compress(self): """ Actual call of weight compression From 3a61ccf81c80c1260a098dec63a6bf19e5e1e74d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:04:50 +0400 Subject: [PATCH 15/34] feat: Tokenize an input string (Temporary) to feed in torch model TODO: Tokenize the dataset, instead of the string --- tests/post_training/pipelines/lm_weight_compression.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 4b004301a41..fecb029c7ba 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -140,6 +140,15 @@ def transform_fn(data): def prepare_calibration_dataset(self): dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e") dataset = dataset.filter(lambda example: len(example["text"]) > 80) + if self.backend == BackendType.TORCH: + example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." + token = self.tokenizer(example_text, max_length=500, return_tensors="pt", truncation=True) + inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} + + self.calibration_dataset = nncf.Dataset([inputs]) + + return + self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From ea0c4c4fe40c1e55dcba12471ef0370cd50a346d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:24:05 +0400 Subject: [PATCH 16/34] fix: Added torch_dtype parameter to the model --- tests/post_training/pipelines/lm_weight_compression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index fecb029c7ba..253db44cfa7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,6 +18,7 @@ import numpy as np import openvino as ov +import torch import transformers from datasets import load_dataset from memory_profiler import memory_usage @@ -78,7 +79,7 @@ def prepare_model(self) -> None: if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - MODEL_SPECIFIC_PARAMS = {} + MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) else: MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} From c3461008e05e69d0f89cc361a682958c88920f27 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:40:24 +0400 Subject: [PATCH 17/34] chore: Removed unnecessary compression parameters --- tests/post_training/model_scope.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ac4f35051ca..9574b66cfaf 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -348,9 +348,6 @@ "pipeline_cls": LMWeightCompression, "compression_params": { "mode": CompressWeightsMode.INT8_ASYM, - "all_layers": None, - "awq": None, - "sensitivity_metric": None, }, "backends": [BackendType.TORCH], }, From 1cfccf97de737a28ddbfff5086a227ea22c2e950 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 14:40:54 +0400 Subject: [PATCH 18/34] refactor: Line spacing, preprocessor usage --- .../pipelines/lm_weight_compression.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 253db44cfa7..c1208ee4338 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -19,7 +19,6 @@ import numpy as np import openvino as ov import torch -import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -76,25 +75,31 @@ class LMWeightCompression(BaseTestPipeline): def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) + if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} - self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) + self.MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} else: - MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} + self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} + if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained(self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS) + self.model_hf = self.MODEL_FUNC.from_pretrained( + self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + ) + if self.backend != BackendType.TORCH: self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) + self.model = self.model_hf.model def prepare_preprocessor(self) -> None: @@ -143,7 +148,7 @@ def prepare_calibration_dataset(self): dataset = dataset.filter(lambda example: len(example["text"]) > 80) if self.backend == BackendType.TORCH: example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = self.tokenizer(example_text, max_length=500, return_tensors="pt", truncation=True) + token = self.preprocessor(example_text, max_length=500, return_tensors="pt", truncation=True) inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} self.calibration_dataset = nncf.Dataset([inputs]) @@ -211,6 +216,9 @@ def _compress(self): dataset=self.calibration_dataset, **self.compression_params, ) + self.compressed_model = ov.convert_model( + self.compressed_model, example_input=torch.rand(1, 3, 224, 224).to(torch.long) + ) def _validate(self): is_stateful = self.params.get("is_stateful", False) From 5deba30ed9c33a1cb99696a3152c5810095861bb Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sat, 27 Apr 2024 23:15:04 +0400 Subject: [PATCH 19/34] fix: Removing convert_model() Does not work with CausalModels --- tests/post_training/pipelines/lm_weight_compression.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index c1208ee4338..296adc22e3a 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -216,9 +216,6 @@ def _compress(self): dataset=self.calibration_dataset, **self.compression_params, ) - self.compressed_model = ov.convert_model( - self.compressed_model, example_input=torch.rand(1, 3, 224, 224).to(torch.long) - ) def _validate(self): is_stateful = self.params.get("is_stateful", False) From 40c5686755b49bfc6947b60dc1a5f12d489b1804 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 00:03:34 +0400 Subject: [PATCH 20/34] fix: The pipeline now runs for TORCH models TODO: Figure out why the metric value is so low (-0.00414) --- .../pipelines/lm_weight_compression.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 296adc22e3a..9a093e51da2 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -92,8 +92,7 @@ def prepare_model(self) -> None: self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) - if self.backend != BackendType.TORCH: - self._dump_model_fp32() + self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( @@ -181,6 +180,11 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return + if self.backend == BackendType.TORCH: + self.compressed_model.save_pretrained(self.output_model_dir) + + return + ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) self.model_hf._save_config(self.output_model_dir) @@ -196,7 +200,8 @@ def _dump_model_fp32(self) -> None: to the dedicated shared folder. 
""" self.model_hf.save_pretrained(self.fp32_model_dir) - self.model_hf._save_config(self.fp32_model_dir) + if not self.backend == BackendType.TORCH: + self.model_hf._save_config(self.fp32_model_dir) def _compress(self): """ @@ -231,7 +236,7 @@ def _validate(self): if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") model_gold = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) evaluator.dump_gt(str(gt_data_path)) @@ -245,7 +250,7 @@ def _validate(self): compressed_model_hf = self.model_hf if self.backend != BackendType.FP32: compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) From d3989be4595fd93476926fc7588e66794d744298 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:02:42 +0400 Subject: [PATCH 21/34] fix: Using model_hf for validation --- tests/post_training/pipelines/lm_weight_compression.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 9a093e51da2..85407ba4d7e 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -79,7 +79,7 @@ def prepare_model(self) -> None: if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - self.MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} + self.MODEL_SPECIFIC_PARAMS = {} else: self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} @@ -181,7 +181,7 @@ def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return if self.backend == BackendType.TORCH: - self.compressed_model.save_pretrained(self.output_model_dir) + self.model_hf.save_pretrained(self.output_model_dir) return @@ -248,10 +248,12 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + raise ValueError(f"{type(compressed_model_hf)}") + if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) + print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) similarity = all_metrics["similarity"][0] From 43aec31c78e4bf7e829b6e288a4de1abf2ed1e65 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:03:10 +0400 Subject: [PATCH 22/34] fix: Changed the reference metric value --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 760d6ed5fea..ee3fe5db378 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -15,6 +15,6 @@ 
tinyllama_data_aware_awq_stateful_backend_OV: num_int4: 184 num_int8: 128 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.96283 + metric_value: 0.95944 num_int4: 228 num_int8: 84 From a85ded2bfe9aaf35158af8229d7c787ac9e2bac4 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:04:54 +0400 Subject: [PATCH 23/34] refactor: Pre-Commit changes --- tests/post_training/pipelines/lm_weight_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 85407ba4d7e..b3d94f403f7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -251,7 +251,7 @@ def _validate(self): raise ValueError(f"{type(compressed_model_hf)}") if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) print("Evaluation of the target model") From 28af5697cb166b34790ea878898b53d8da186d5a Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 10:58:59 +0400 Subject: [PATCH 24/34] fix: Returned the original checks for int4/int8 values I do not remember removing these --- .../pipelines/lm_weight_compression.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b3d94f403f7..76a4d244d2b 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -259,3 +259,19 @@ def _validate(self): similarity = all_metrics["similarity"][0] self.run_info.metric_name = "Similarity" self.run_info.metric_value = round(similarity, 5) + + num_int4_reference = self.reference_data.get("num_int4") + num_int8_reference = self.reference_data.get("num_int8") + + num_int4_value = self.run_info.num_compress_nodes.num_int4 + num_int8_value = self.run_info.num_compress_nodes.num_int8 + + if num_int4_reference != num_int4_value: + status_msg = f"Regression: The number of int4 ops is different \ + than reference {num_int4_reference} != {num_int4_value}" + raise ValueError(status_msg) + + if num_int8_reference != num_int8_value: + status_msg = f"Regression: The number of int8 ops is different \ + than reference {num_int8_reference} != {num_int8_value}" + raise ValueError(status_msg) From a72ae7ee603b127c13ec636d87a9b4d949a6b58c Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 11:48:11 +0400 Subject: [PATCH 25/34] chore: Pre-Commit changes --- tests/post_training/pipelines/lm_weight_compression.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 76a4d244d2b..cfa81d19ce8 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,7 +18,6 @@ import numpy as np import openvino as ov -import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -200,7 +199,7 @@ def _dump_model_fp32(self) -> None: to the dedicated shared folder. 
""" self.model_hf.save_pretrained(self.fp32_model_dir) - if not self.backend == BackendType.TORCH: + if self.backend != BackendType.TORCH: self.model_hf._save_config(self.fp32_model_dir) def _compress(self): @@ -223,7 +222,6 @@ def _compress(self): ) def _validate(self): - is_stateful = self.params.get("is_stateful", False) core = ov.Core() if os.environ.get("INFERENCE_NUM_THREADS"): From 7d328c3253608242a68ad605a1393fc120124942 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 20:53:13 +0400 Subject: [PATCH 26/34] refactor: Pre-Commit Changes --- tests/post_training/pipelines/lm_weight_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 13419ab66be..2318f3b5f86 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -158,7 +158,7 @@ def prepare_calibration_dataset(self): self.calibration_dataset = nncf.Dataset([inputs]) - return + return self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From 7e50cfa4a64aeb884f403421457856a3abf6efb1 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 1 May 2024 16:03:16 +0400 Subject: [PATCH 27/34] fix: Removed the debugging line --- tests/post_training/pipelines/lm_weight_compression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 2318f3b5f86..784f7514a21 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -252,7 +252,6 @@ def _validate(self): ) compressed_model_hf = self.model_hf - raise ValueError(f"{type(compressed_model_hf)}") if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS From 7c31d3d0212fdd7454629107fdca5069abd6e972 Mon Sep 17 00:00:00 2001 From: Adil Alizada <80326762+AdiKsOnDev@users.noreply.github.com> Date: Thu, 2 May 2024 14:57:06 +0400 Subject: [PATCH 28/34] fix: Corrected reference data for TORCH backend Co-authored-by: Alexander Suslov --- tests/post_training/data/wc_reference_data.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index b93d84d65dd..af744abe02f 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -19,6 +19,6 @@ tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV: num_int4: 188 num_int8: 124 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.95944 - num_int4: 228 - num_int8: 84 \ No newline at end of file + metric_value: 0.95624 + num_int4: 0 + num_int8: 312 \ No newline at end of file From 6899097e8d977fc942d9c31902b3b4cc978f9785 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:03:35 +0400 Subject: [PATCH 29/34] refactor: Code made cleaner Also deleted the following class attributes: MODEL_NAME MODEL_FUNC --- .../pipelines/lm_weight_compression.py | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py 
b/tests/post_training/pipelines/lm_weight_compression.py index 784f7514a21..eac1b493d8f 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,6 +16,7 @@ from dataclasses import dataclass from typing import Dict, Optional +import torch import numpy as np import openvino as ov from datasets import load_dataset @@ -69,36 +70,40 @@ def get_stats(self) -> Dict[str, str]: class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" - MODEL_NAME = "openvino_model.xml" - MODEL_FUNC = OVModelForCausalLM + OV_MODEL_NAME = "openvino_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) + # load model if self.backend == BackendType.TORCH: - self.MODEL_NAME = "torch_model.xml" - self.MODEL_FUNC = AutoModelForCausalLM - self.MODEL_SPECIFIC_PARAMS = {} - else: - self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} - - if is_stateful: - self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if is_stateful: + raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.") - if not (self.fp32_model_dir / self.MODEL_NAME).exists(): - # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + self.model_hf = AutoModelForCausalLM.from_pretrained( + self.model_id, torch_dtype=torch.float32, device_map="cpu" ) - - self._dump_model_fp32() + self.model = self.model_hf + elif self.backend == BackendType.OV: + if is_stateful: + self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + # export by model_id + self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful + ) + else: + # no export, load from IR. Applicable for sequential run of test cases in local environment. + self.model_hf = OVModelForCausalLM.from_pretrained( + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + ) + self.model = self.model_hf.model else: - # no export, load from IR. Applicable for sequential run of test cases in local environment. 
- self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS - ) + raise RuntimeError(f"backend={self.backend.value} is not supported.") - self.model = self.model_hf.model + # dump FP32 model + if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + self._dump_model_fp32() def prepare_preprocessor(self) -> None: self.preprocessor = AutoTokenizer.from_pretrained(self.model_id) From 86e91f909549817205f08dbad3523c46dbb3a763 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:09:12 +0400 Subject: [PATCH 30/34] fix: Utilized wikitext for TORCH models as well Co-authored-by: Alexander Suslov --- .../pipelines/lm_weight_compression.py | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index eac1b493d8f..6a47f629194 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,9 +16,9 @@ from dataclasses import dataclass from typing import Dict, Optional -import torch import numpy as np import openvino as ov +import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -126,29 +126,32 @@ def transform_fn(data, max_tokens=128): inputs["attention_mask"] = attention_mask position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 - - # The magic forms KV cache as model inputs - batch_size = input_ids.shape[0] - for input_name in self.model_hf.key_value_input_names: - model_inputs = self.model.input(input_name) - shape = model_inputs.get_partial_shape() - shape[0] = batch_size - if shape[2].is_dynamic: - shape[2] = 0 - else: - shape[1] = 0 - inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) - inputs["position_ids"] = position_ids - # initialize the rest of inputs (e.g. beam_idx for stateful models) - for val in self.model.inputs: - name = val.any_name - if name in inputs: - continue - shape = list(val.partial_shape.get_min_shape()) - shape[0] = batch_size - inputs[name] = np.zeros(shape) + if self.backend == BackendType.OV: + # The magic forms KV cache as model inputs + batch_size = input_ids.shape[0] + for input_name in self.model_hf.key_value_input_names: + model_inputs = self.model.input(input_name) + shape = model_inputs.get_partial_shape() + shape[0] = batch_size + if shape[2].is_dynamic: + shape[2] = 0 + else: + shape[1] = 0 + inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) + + # initialize the rest of inputs (e.g. beam_idx for stateful models) + for val in self.model.inputs: + name = val.any_name + if name in inputs: + continue + shape = list(val.partial_shape.get_min_shape()) + shape[0] = batch_size + inputs[name] = np.zeros(shape) + if self.backend == BackendType.TORCH: + for input_name in inputs: + inputs[input_name] = torch.from_numpy(inputs[input_name]) return inputs return transform_fn @@ -156,14 +159,7 @@ def transform_fn(data, max_tokens=128): def prepare_calibration_dataset(self): dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e") dataset = dataset.filter(lambda example: len(example["text"]) > 128) - if self.backend == BackendType.TORCH: - example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = self.preprocessor(example_text, max_length=500, return_tensors="pt", truncation=True) - inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} - - self.calibration_dataset = nncf.Dataset([inputs]) - return self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From 7f32430235dc703ff72f95cda85b82154e3eb7da Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:14:45 +0400 Subject: [PATCH 31/34] feat: Implemented get_num_compressed Co-authored-by: Alexander Suslov --- .../pipelines/lm_weight_compression.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 6a47f629194..96b725849d0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -22,6 +22,7 @@ from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM +from optimum.exporters.openvino.convert import export_from_model from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -186,16 +187,34 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - if self.backend == BackendType.TORCH: - self.model_hf.save_pretrained(self.output_model_dir) - - return - ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) - self.model_hf._save_config(self.output_model_dir) + if self.backend == BackendType.OV: + ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + self.model_hf._save_config(self.output_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") def get_num_compressed(self) -> None: - pass + """ + Get number of the i8, u8, i4, u4 ops in the compressed IR. + """ + num_int8 = 0 + num_int4 = 0 + + if self.backend == BackendType.TORCH: + model = ov.Core().read_model(self.output_model_dir / self.OV_MODEL_NAME) + else: + model = self.model + + for node in model.get_ops(): + for i in range(node.get_output_size()): + if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]: + num_int8 += 1 + if node.get_output_element_type(i).get_type_name() in ["i4", "u4"]: + num_int4 += 1 + + self.run_info.num_compress_nodes.num_int8 = num_int8 + self.run_info.num_compress_nodes.num_int4 = num_int4 def run_bench(self) -> None: pass From 7729867343e3b16f5284ea6686445dc378122e35 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:15:52 +0400 Subject: [PATCH 32/34] fix: Dumping the fp32 model correctly Utilization of export_from_model() function from Optimum Co-authored-by: Alexander Suslov --- tests/post_training/pipelines/lm_weight_compression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 96b725849d0..6e0c3e5138b 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -224,9 +224,11 @@ def _dump_model_fp32(self) -> None: Dump IRs of fp32 models, to help debugging. 
The test cases may share the same fp32 model, therefore it is saved to the dedicated shared folder. """ - self.model_hf.save_pretrained(self.fp32_model_dir) - if self.backend != BackendType.TORCH: + if self.backend == BackendType.OV: + self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.fp32_model_dir, stateful=False, compression_option="fp32") def _compress(self): """ From 70cd9120132cdc24a7e413b87801fb56daf28cfa Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:19:48 +0400 Subject: [PATCH 33/34] chore: Removed unneccesary model wrapping TORCH Backends only --- .../pipelines/lm_weight_compression.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 6e0c3e5138b..51a96777c83 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -21,8 +21,8 @@ import torch from datasets import load_dataset from memory_profiler import memory_usage -from optimum.intel.openvino import OVModelForCausalLM from optimum.exporters.openvino.convert import export_from_model +from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -189,10 +189,10 @@ def save_compressed_model(self) -> None: return if self.backend == BackendType.OV: - ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) - self.model_hf._save_config(self.output_model_dir) - elif self.backend == BackendType.TORCH: - export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") + ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + self.model_hf._save_config(self.output_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") def get_num_compressed(self) -> None: """ @@ -234,15 +234,6 @@ def _compress(self): """ Actual call of weight compression """ - if self.backend == BackendType.TORCH: - from nncf.torch.model_creation import is_wrapped_model - from nncf.torch.model_creation import wrap_model - - if not is_wrapped_model(self.model): - example_input = next(iter(self.calibration_dataset.get_inference_data())) - self.model = wrap_model(self.model, example_input=example_input, trace_parameters=True) - self.calibration_dataset = None - self.compressed_model = nncf.compress_weights( self.model, dataset=self.calibration_dataset, From e5db8cc42ae2cb1f2784a8ae7f4e8e4d07728e0a Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:23:03 +0400 Subject: [PATCH 34/34] fix: Changed _validate to match the modified pipeline --- .../post_training/pipelines/lm_weight_compression.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 51a96777c83..fcab0a20f88 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -241,6 +241,7 @@ def _compress(self): ) def _validate(self): + is_stateful = self.params.get("is_stateful", False) core = ov.Core() if os.environ.get("INFERENCE_NUM_THREADS"): @@ 
-252,8 +253,8 @@ def _validate(self): gt_data_path.parent.mkdir(parents=True, exist_ok=True) if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") - model_gold = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + model_gold = OVModelForCausalLM.from_pretrained( + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) evaluator.dump_gt(str(gt_data_path)) @@ -265,11 +266,10 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: - compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + if self.backend != BackendType.FP32: + compressed_model_hf = OVModelForCausalLM.from_pretrained( + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) - print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) similarity = all_metrics["similarity"][0]
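
Note: for reference, the PyTorch weight-compression path that this series converges on (roughly patches 29-34) can be condensed into the standalone sketch below. It is an approximation of the final LMWeightCompression behaviour for BackendType.TORCH, not code taken from the repository: output_dir is a placeholder, the calibration transform is simplified from the patched get_transform_calibration_fn(), and only the stages touched by this series are shown.

# Condensed sketch of the final TORCH-backend flow (assumptions noted above).
import numpy as np
import torch

import nncf
import openvino as ov
from datasets import load_dataset
from nncf.parameters import CompressWeightsMode
from optimum.exporters.openvino.convert import export_from_model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_id = "tinyllama/tinyllama-1.1b-step-50k-105b"
output_dir = "tinyllama_int8_data_free"  # placeholder output directory

# prepare_model(): plain Hugging Face model on CPU for the TORCH backend.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="cpu")


# prepare_calibration_dataset() / get_transform_calibration_fn(): wikitext samples,
# tokenized and handed to NNCF as torch tensors (simplified version of the patched code).
def transform_fn(data, max_tokens=128):
    tokens = tokenizer(data["text"], max_length=max_tokens, truncation=True, return_tensors="np")
    attention_mask = tokens["attention_mask"]
    position_ids = np.cumsum(attention_mask, axis=1) - 1
    position_ids[attention_mask == 0] = 1
    inputs = {
        "input_ids": tokens["input_ids"],
        "attention_mask": attention_mask,
        "position_ids": position_ids,
    }
    return {name: torch.from_numpy(value) for name, value in inputs.items()}


dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e")
dataset = dataset.filter(lambda example: len(example["text"]) > 128)
calibration_dataset = nncf.Dataset(dataset, transform_fn)

# _compress(): INT8 asymmetric weight compression; the pipeline keeps the result
# as self.compressed_model.
compressed_model = nncf.compress_weights(
    model, mode=CompressWeightsMode.INT8_ASYM, dataset=calibration_dataset
)

# save_compressed_model(): as in the patched pipeline, the Hugging Face model handle
# is exported to an OpenVINO IR after compression.
export_from_model(model, output_dir, stateful=False, compression_option="fp32")

# get_num_compressed(): count low-precision ops in the exported IR.
ir_model = ov.Core().read_model(f"{output_dir}/openvino_model.xml")
num_int8 = num_int4 = 0
for node in ir_model.get_ops():
    for i in range(node.get_output_size()):
        type_name = node.get_output_element_type(i).get_type_name()
        if type_name in ("i8", "u8"):
            num_int8 += 1
        if type_name in ("i4", "u4"):
            num_int4 += 1
print(f"num_int8={num_int8}, num_int4={num_int4}")

The sketch mirrors the pipeline stages touched by the series: prepare_model() loads the plain Hugging Face model on CPU, prepare_calibration_dataset() feeds tokenized wikitext samples as torch tensors, _compress() calls nncf.compress_weights with INT8_ASYM, save_compressed_model() exports an OpenVINO IR via export_from_model, and get_num_compressed() counts the i8/u8 and i4/u4 ops in that IR so they can be checked against wc_reference_data.yaml.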