From 2d6fbe578f077a86281b82ebf71ce4a87745890e Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 9 Apr 2024 14:47:41 +0400 Subject: [PATCH 01/34] feat: Added to the test scope TODO: Make the test --- tests/post_training/model_scope.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index e6c46f715d0..ed6a732eeae 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -326,6 +326,15 @@ "backends": [BackendType.OV], "is_batch_size_supported": False, }, + { + "reported_name": "tinyllama_int8_data_free", + "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b", + "pipeline_cls": LMWeightCompression, + "compression_params": { + "mode": CompressWeightsMode.INT8_ASYM, + }, + "backends": [BackendType.TORCH], + }, ] From 52e81800e97c2bbce53996ffd71d591d9990ad10 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Fri, 12 Apr 2024 15:43:10 +0400 Subject: [PATCH 02/34] feat: Added torch backend support compress() and _compress_torch() methods were implemented --- .../pipelines/lm_weight_compression.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b1a6e5853dc..36cdad457f0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,6 +18,8 @@ import numpy as np import openvino as ov +import torch +import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -143,6 +145,22 @@ def cleanup_cache(self): def compress(self) -> None: if self.backend == BackendType.FP32: return + elif self.backend == BackendType.TORCH: + start_time = time.perf_counter() + + tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) + self.model = transformers.AutoModelForCausalLM.from_pretrained( + self.model_id, torch_dtype=torch.float16, device_map="cpu" + ) + + text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
+ token = tokenizer(text, max_length=500, return_tensors="pt", truncation=True) + inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} + + self.run_info.compression_memory_usage = memory_usage(self._compress_torch(inputs), max_usage=True) + self.run_info.time_compression = time.perf_counter() - start_time + + return print("Weight compression...") start_time = time.perf_counter() @@ -174,6 +192,9 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + def _compress_torch(self, inputs): + self.compressed_model = nncf.compress_weights(self.model, dataset=nncf.Dataset([inputs])) + def _compress(self): """ Actual call of weight compression From c02480338141fbeaf990ce9d149d2c252c472e93 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 18 Apr 2024 21:17:29 +0400 Subject: [PATCH 03/34] fix: Moved int8 conversion in _validate() --- .../pipelines/lm_weight_compression.py | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 36cdad457f0..80d8cf3a5f9 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -19,10 +19,10 @@ import numpy as np import openvino as ov import torch -import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM +from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -145,22 +145,6 @@ def cleanup_cache(self): def compress(self) -> None: if self.backend == BackendType.FP32: return - elif self.backend == BackendType.TORCH: - start_time = time.perf_counter() - - tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) - self.model = transformers.AutoModelForCausalLM.from_pretrained( - self.model_id, torch_dtype=torch.float16, device_map="cpu" - ) - - text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = tokenizer(text, max_length=500, return_tensors="pt", truncation=True) - inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} - - self.run_info.compression_memory_usage = memory_usage(self._compress_torch(inputs), max_usage=True) - self.run_info.time_compression = time.perf_counter() - start_time - - return print("Weight compression...") start_time = time.perf_counter() @@ -192,9 +176,6 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) - def _compress_torch(self, inputs): - self.compressed_model = nncf.compress_weights(self.model, dataset=nncf.Dataset([inputs])) - def _compress(self): """ Actual call of weight compression @@ -231,7 +212,11 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + if self.backend == BackendType.TORCH: + compressed_model_hf = AutoModelForCausalLM.from_pretrained( + self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" + ) + elif self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f48c148de50e0a0891a494483ee11371390a5872 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 21:41:28 +0400 Subject: [PATCH 04/34] fix: Returned initial implementation of _validate() --- tests/post_training/pipelines/lm_weight_compression.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index c3f5d89a3f4..169d0a5f771 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -71,6 +71,7 @@ class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" OV_MODEL_NAME = "openvino_model.xml" + TORCH_MODEL_NAME = "torch_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) @@ -226,11 +227,7 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend == BackendType.TORCH: - compressed_model_hf = AutoModelForCausalLM.from_pretrained( - self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" - ) - elif self.backend != BackendType.FP32: + if self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f9505e41c3a5fe13f12f4a2dceccbb64f180a1eb Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 22:52:34 +0400 Subject: [PATCH 05/34] chore: Temporary dummy data --- tests/post_training/data/wc_reference_data.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 5235d155244..1e97dc4c3d0 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -13,4 +13,8 @@ tinyllama_data_aware_awq_backend_OV: tinyllama_data_aware_awq_stateful_backend_OV: metric_value: 0.81237 num_int4: 184 - num_int8: 128 \ No newline at end of file + num_int8: 128 +tinyllama_int8_data_free_backend_TORCH: + metric_value: 0.72057 + num_int4: 228 + num_int8: 84 From 2bc73ec0b83ed0272ec0505d49d7b149bbb92115 Mon Sep 17 
00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 22:53:48 +0400 Subject: [PATCH 06/34] fix: Model Preparation for TORCH backend --- .../pipelines/lm_weight_compression.py | 50 ++++++------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 169d0a5f771..121000e36c8 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,9 +16,9 @@ from dataclasses import dataclass from typing import Dict, Optional +import torch import numpy as np import openvino as ov -import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -71,7 +71,6 @@ class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" OV_MODEL_NAME = "openvino_model.xml" - TORCH_MODEL_NAME = "torch_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) @@ -147,6 +146,11 @@ def compress(self) -> None: if self.backend == BackendType.FP32: return + if self.backend == BackendType.TORCH: + inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS") + if inference_num_threads is not None: + torch.set_num_threads(int(inference_num_threads)) + print("Weight compression...") start_time = time.perf_counter() self.run_info.compression_memory_usage = memory_usage(self._compress, max_usage=True) @@ -160,25 +164,15 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return + elif self.backend == BackendType.TORCH: + self.model = ov.convert_model( + self.compressed_model.cpu(), example_input=self.dummy_tensor.cpu(), input=self.input_size + ) ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) def get_num_compressed(self) -> None: - """ - Get number of the i8, u8, i4, u4 ops in the compressed IR. 
- """ - num_int8 = 0 - num_int4 = 0 - - for node in self.model.get_ops(): - for i in range(node.get_output_size()): - if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]: - num_int8 += 1 - if node.get_output_element_type(i).get_type_name() in ["i4", "u4"]: - num_int4 += 1 - - self.run_info.num_compress_nodes.num_int8 = num_int8 - self.run_info.num_compress_nodes.num_int4 = num_int4 + pass def run_bench(self) -> None: pass @@ -227,7 +221,11 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + if self.backend == BackendType.TORCH: + compressed_model_hf = AutoModelForCausalLM.from_pretrained( + self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" + ) + elif self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) @@ -236,19 +234,3 @@ def _validate(self): similarity = all_metrics["similarity"][0] self.run_info.metric_name = "Similarity" self.run_info.metric_value = round(similarity, 5) - - num_int4_reference = self.reference_data.get("num_int4") - num_int8_reference = self.reference_data.get("num_int8") - - num_int4_value = self.run_info.num_compress_nodes.num_int4 - num_int8_value = self.run_info.num_compress_nodes.num_int8 - - if num_int4_reference != num_int4_value: - status_msg = f"Regression: The number of int4 ops is different \ - than reference {num_int4_reference} != {num_int4_value}" - raise ValueError(status_msg) - - if num_int8_reference != num_int8_value: - status_msg = f"Regression: The number of int8 ops is different \ - than reference {num_int8_reference} != {num_int8_value}" - raise ValueError(status_msg) From 927c38fe9d933ba4b6bb38f22b9fb861ee72b580 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:23:24 +0400 Subject: [PATCH 07/34] fix: Removed unsupported parameters for INT8 TODO: Maybe make it in a way where I check for INT8 instead of BackendType.TORCH, --- tests/post_training/model_scope.py | 3 +++ .../pipelines/lm_weight_compression.py | 26 +++++++------------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ed6a732eeae..3920d0af3ea 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -332,6 +332,9 @@ "pipeline_cls": LMWeightCompression, "compression_params": { "mode": CompressWeightsMode.INT8_ASYM, + "all_layers": None, + "awq": None, + "sensitivity_metric": None, }, "backends": [BackendType.TORCH], }, diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 121000e36c8..ae99b150787 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,13 +16,11 @@ from dataclasses import dataclass from typing import Dict, Optional -import torch import numpy as np import openvino as ov from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -146,11 +144,6 @@ def compress(self) -> None: if self.backend == BackendType.FP32: return - if self.backend == BackendType.TORCH: - inference_num_threads = os.environ.get("INFERENCE_NUM_THREADS") - if inference_num_threads is not None: - 
torch.set_num_threads(int(inference_num_threads)) - print("Weight compression...") start_time = time.perf_counter() self.run_info.compression_memory_usage = memory_usage(self._compress, max_usage=True) @@ -164,10 +157,6 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - elif self.backend == BackendType.TORCH: - self.model = ov.convert_model( - self.compressed_model.cpu(), example_input=self.dummy_tensor.cpu(), input=self.input_size - ) ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) self.model_hf._save_config(self.output_model_dir) @@ -189,6 +178,15 @@ def _compress(self): """ Actual call of weight compression """ + if self.backend == BackendType.TORCH: + self.compressed_model = nncf.compress_weights( + self.model, + dataset=None, + **self.compression_params, + ) + + return + self.compressed_model = nncf.compress_weights( self.model, dataset=self.calibration_dataset, @@ -221,11 +219,7 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend == BackendType.TORCH: - compressed_model_hf = AutoModelForCausalLM.from_pretrained( - self.output_model_dir, torch_dtype=torch.float16, device_map="cpu" - ) - elif self.backend != BackendType.FP32: + if self.backend != BackendType.FP32: compressed_model_hf = OVModelForCausalLM.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) From f0081037f28af2a829043d4ddaf4902d91864724 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:34:46 +0400 Subject: [PATCH 08/34] chore: Comment on important addition --- tests/post_training/pipelines/lm_weight_compression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index ae99b150787..cf1845266bd 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -179,6 +179,7 @@ def _compress(self): Actual call of weight compression """ if self.backend == BackendType.TORCH: + """If Backend is TORCH (Assuming that it's INT8 compression), don't use a dataset as it's Unsupported""" self.compressed_model = nncf.compress_weights( self.model, dataset=None, From eeade477464c016cd031be399f6a64db9b1cdb7c Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Mon, 22 Apr 2024 23:46:08 +0400 Subject: [PATCH 09/34] feat: Added correct metric value according to @aleksu52 Co-authored-by: Aleksander --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 1e97dc4c3d0..760d6ed5fea 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -15,6 +15,6 @@ tinyllama_data_aware_awq_stateful_backend_OV: num_int4: 184 num_int8: 128 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.72057 + metric_value: 0.96283 num_int4: 228 num_int8: 84 From fc05eed783f417584d630b879aa3263b5d963e7d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 23 Apr 2024 11:35:02 +0400 Subject: [PATCH 10/34] fix: Mode accurate check for the INT8 compression mode --- .../pipelines/lm_weight_compression.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py 
b/tests/post_training/pipelines/lm_weight_compression.py index cf1845266bd..73d2dc39ef9 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -20,6 +20,7 @@ import openvino as ov from datasets import load_dataset from memory_profiler import memory_usage +from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -178,15 +179,9 @@ def _compress(self): """ Actual call of weight compression """ - if self.backend == BackendType.TORCH: - """If Backend is TORCH (Assuming that it's INT8 compression), don't use a dataset as it's Unsupported""" - self.compressed_model = nncf.compress_weights( - self.model, - dataset=None, - **self.compression_params, - ) - - return + if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: + """If compression mode is INT8, don't use a dataset as it's Unsupported""" + self.calibration_dataset = None self.compressed_model = nncf.compress_weights( self.model, From 4aefa0dbd92a8063de246d96b7984e4208d09312 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 23 Apr 2024 17:10:49 +0400 Subject: [PATCH 11/34] feat: Problematic code for @aleksu52 to reproduce --- .../pipelines/lm_weight_compression.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 73d2dc39ef9..18229b73aae 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -175,13 +175,31 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + # def _compress(self): + # """ + # Actual call of weight compression + # """ + # if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: + # """If compression mode is INT8, don't use a dataset as it's Unsupported""" + # self.calibration_dataset = None + # + # self.compressed_model = nncf.compress_weights( + # self.model, + # dataset=self.calibration_dataset, + # **self.compression_params, + # ) def _compress(self): """ Actual call of weight compression """ - if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: - """If compression mode is INT8, don't use a dataset as it's Unsupported""" - self.calibration_dataset = None + if self.backend == BackendType.TORCH: + from nncf.torch.model_creation import is_wrapped_model + from nncf.torch.model_creation import wrap_model + + if not is_wrapped_model(self.model): + example_input = next(iter(self.calibration_dataset.get_inference_data())) + self.model = wrap_model(self.model, example_input=example_input, trace_parameters=True) + self.calibration_dataset = None self.compressed_model = nncf.compress_weights( self.model, From 737c1a70bbd06c0f88ad3eb8aef293dc2bdcc866 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 24 Apr 2024 13:05:22 +0400 Subject: [PATCH 12/34] feat: Use AutoModelForCausalLM for TORCH models --- .../pipelines/lm_weight_compression.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 18229b73aae..a1b634481e7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ 
b/tests/post_training/pipelines/lm_weight_compression.py @@ -22,7 +22,7 @@ from memory_profiler import memory_usage from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AutoModelForCausalLM from whowhatbench import Evaluator import nncf @@ -69,21 +69,25 @@ def get_stats(self) -> Dict[str, str]: class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" - OV_MODEL_NAME = "openvino_model.xml" + MODEL_NAME = "openvino_model.xml" + MODEL_FUNC = OVModelForCausalLM def prepare_model(self) -> None: + if self.backend == BackendType.TORCH: + self.MODEL_NAME = "torch_model.xml" + self.MODEL_FUNC = AutoModelForCausalLM is_stateful = self.params.get("is_stateful", False) if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") - if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_hf = self.MODEL_FUNC.from_pretrained( self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful ) self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. - self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_hf = self.MODEL_FUNC.from_pretrained( self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) self.model = self.model_hf.model @@ -158,7 +162,7 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) self.model_hf._save_config(self.output_model_dir) def get_num_compressed(self) -> None: @@ -220,7 +224,7 @@ def _validate(self): gt_data_path.parent.mkdir(parents=True, exist_ok=True) if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") - model_gold = OVModelForCausalLM.from_pretrained( + model_gold = self.MODEL_FUNC.from_pretrained( self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) @@ -234,7 +238,7 @@ def _validate(self): compressed_model_hf = self.model_hf if self.backend != BackendType.FP32: - compressed_model_hf = OVModelForCausalLM.from_pretrained( + compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) print("Evaluation of the target model") From 8066b76daca190cf3349d51f18c95c6fa8267ee6 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 24 Apr 2024 13:12:57 +0400 Subject: [PATCH 13/34] fix: Added model specific parameters during preparation Some parameters for OVModelForCausalLM DO NOT apply for LlamaForCausalLM --- tests/post_training/pipelines/lm_weight_compression.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index a1b634481e7..96551d983c0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ 
b/tests/post_training/pipelines/lm_weight_compression.py @@ -73,22 +73,25 @@ class LMWeightCompression(BaseTestPipeline): MODEL_FUNC = OVModelForCausalLM def prepare_model(self) -> None: + is_stateful = self.params.get("is_stateful", False) if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - is_stateful = self.params.get("is_stateful", False) + MODEL_SPECIFIC_PARAMS = {} + else: + MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS ) self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS ) self.model = self.model_hf.model From 004199852d0d7145adec284210913a30490bfbca Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:03:51 +0400 Subject: [PATCH 14/34] refactor: Make a tokenizer during model preparation Needed to tokenize example input for torch model later --- .../pipelines/lm_weight_compression.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 96551d983c0..4b004301a41 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,11 +18,12 @@ import numpy as np import openvino as ov +import transformers from datasets import load_dataset from memory_profiler import memory_usage -from nncf.parameters import CompressWeightsMode from optimum.intel.openvino import OVModelForCausalLM -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import AutoModelForCausalLM +from transformers import AutoTokenizer from whowhatbench import Evaluator import nncf @@ -78,16 +79,16 @@ def prepare_model(self) -> None: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM MODEL_SPECIFIC_PARAMS = {} + self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) else: MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS - ) - self._dump_model_fp32() + self.model_hf = self.MODEL_FUNC.from_pretrained(self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS) + if self.backend != BackendType.TORCH: + self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. 
self.model_hf = self.MODEL_FUNC.from_pretrained( @@ -182,19 +183,6 @@ def _dump_model_fp32(self) -> None: self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) - # def _compress(self): - # """ - # Actual call of weight compression - # """ - # if self.compression_params["mode"] == CompressWeightsMode.INT8_ASYM: - # """If compression mode is INT8, don't use a dataset as it's Unsupported""" - # self.calibration_dataset = None - # - # self.compressed_model = nncf.compress_weights( - # self.model, - # dataset=self.calibration_dataset, - # **self.compression_params, - # ) def _compress(self): """ Actual call of weight compression From 3a61ccf81c80c1260a098dec63a6bf19e5e1e74d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:04:50 +0400 Subject: [PATCH 15/34] feat: Tokenize an input string (Temporary) to feed in torch model TODO: Tokenize the dataset, instead of the string --- tests/post_training/pipelines/lm_weight_compression.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 4b004301a41..fecb029c7ba 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -140,6 +140,15 @@ def transform_fn(data): def prepare_calibration_dataset(self): dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e") dataset = dataset.filter(lambda example: len(example["text"]) > 80) + if self.backend == BackendType.TORCH: + example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." + token = self.tokenizer(example_text, max_length=500, return_tensors="pt", truncation=True) + inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} + + self.calibration_dataset = nncf.Dataset([inputs]) + + return + self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From ea0c4c4fe40c1e55dcba12471ef0370cd50a346d Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:24:05 +0400 Subject: [PATCH 16/34] fix: Added torch_dtype parameter to the model --- tests/post_training/pipelines/lm_weight_compression.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index fecb029c7ba..253db44cfa7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,6 +18,7 @@ import numpy as np import openvino as ov +import torch import transformers from datasets import load_dataset from memory_profiler import memory_usage @@ -78,7 +79,7 @@ def prepare_model(self) -> None: if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - MODEL_SPECIFIC_PARAMS = {} + MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) else: MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} From c3461008e05e69d0f89cc361a682958c88920f27 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 00:40:24 +0400 Subject: [PATCH 17/34] chore: Removed unnecessary compression parameters --- tests/post_training/model_scope.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/tests/post_training/model_scope.py b/tests/post_training/model_scope.py index ac4f35051ca..9574b66cfaf 100644 --- a/tests/post_training/model_scope.py +++ b/tests/post_training/model_scope.py @@ -348,9 +348,6 @@ "pipeline_cls": LMWeightCompression, "compression_params": { "mode": CompressWeightsMode.INT8_ASYM, - "all_layers": None, - "awq": None, - "sensitivity_metric": None, }, "backends": [BackendType.TORCH], }, From 1cfccf97de737a28ddbfff5086a227ea22c2e950 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 25 Apr 2024 14:40:54 +0400 Subject: [PATCH 18/34] refactor: Line spacing, preprocessor usage --- .../pipelines/lm_weight_compression.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 253db44cfa7..c1208ee4338 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -19,7 +19,6 @@ import numpy as np import openvino as ov import torch -import transformers from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -76,25 +75,31 @@ class LMWeightCompression(BaseTestPipeline): def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) + if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} - self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id) + self.MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} else: - MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} + self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} + if is_stateful: self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if not (self.fp32_model_dir / self.MODEL_NAME).exists(): # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained(self.model_id, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS) + self.model_hf = self.MODEL_FUNC.from_pretrained( + self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + ) + if self.backend != BackendType.TORCH: self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **MODEL_SPECIFIC_PARAMS + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) + self.model = self.model_hf.model def prepare_preprocessor(self) -> None: @@ -143,7 +148,7 @@ def prepare_calibration_dataset(self): dataset = dataset.filter(lambda example: len(example["text"]) > 80) if self.backend == BackendType.TORCH: example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = self.tokenizer(example_text, max_length=500, return_tensors="pt", truncation=True) + token = self.preprocessor(example_text, max_length=500, return_tensors="pt", truncation=True) inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} self.calibration_dataset = nncf.Dataset([inputs]) @@ -211,6 +216,9 @@ def _compress(self): dataset=self.calibration_dataset, **self.compression_params, ) + self.compressed_model = ov.convert_model( + self.compressed_model, example_input=torch.rand(1, 3, 224, 224).to(torch.long) + ) def _validate(self): is_stateful = self.params.get("is_stateful", False) From 5deba30ed9c33a1cb99696a3152c5810095861bb Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sat, 27 Apr 2024 23:15:04 +0400 Subject: [PATCH 19/34] fix: Removing convert_model() Does not work with CausalModels --- tests/post_training/pipelines/lm_weight_compression.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index c1208ee4338..296adc22e3a 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -216,9 +216,6 @@ def _compress(self): dataset=self.calibration_dataset, **self.compression_params, ) - self.compressed_model = ov.convert_model( - self.compressed_model, example_input=torch.rand(1, 3, 224, 224).to(torch.long) - ) def _validate(self): is_stateful = self.params.get("is_stateful", False) From 40c5686755b49bfc6947b60dc1a5f12d489b1804 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 00:03:34 +0400 Subject: [PATCH 20/34] fix: The pipeline now runs for TORCH models TODO: Figure out why the metric value is so low (-0.00414) --- .../pipelines/lm_weight_compression.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 296adc22e3a..9a093e51da2 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -92,8 +92,7 @@ def prepare_model(self) -> None: self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) - if self.backend != BackendType.TORCH: - self._dump_model_fp32() + self._dump_model_fp32() else: # no export, load from IR. Applicable for sequential run of test cases in local environment. self.model_hf = self.MODEL_FUNC.from_pretrained( @@ -181,6 +180,11 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return + if self.backend == BackendType.TORCH: + self.compressed_model.save_pretrained(self.output_model_dir) + + return + ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) self.model_hf._save_config(self.output_model_dir) @@ -196,7 +200,8 @@ def _dump_model_fp32(self) -> None: to the dedicated shared folder. 
""" self.model_hf.save_pretrained(self.fp32_model_dir) - self.model_hf._save_config(self.fp32_model_dir) + if not self.backend == BackendType.TORCH: + self.model_hf._save_config(self.fp32_model_dir) def _compress(self): """ @@ -231,7 +236,7 @@ def _validate(self): if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") model_gold = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) evaluator.dump_gt(str(gt_data_path)) @@ -245,7 +250,7 @@ def _validate(self): compressed_model_hf = self.model_hf if self.backend != BackendType.FP32: compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) From d3989be4595fd93476926fc7588e66794d744298 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:02:42 +0400 Subject: [PATCH 21/34] fix: Using model_hf for validation --- tests/post_training/pipelines/lm_weight_compression.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 9a093e51da2..85407ba4d7e 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -79,7 +79,7 @@ def prepare_model(self) -> None: if self.backend == BackendType.TORCH: self.MODEL_NAME = "torch_model.xml" self.MODEL_FUNC = AutoModelForCausalLM - self.MODEL_SPECIFIC_PARAMS = {"torch_dtype": torch.float16} + self.MODEL_SPECIFIC_PARAMS = {} else: self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} @@ -181,7 +181,7 @@ def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return if self.backend == BackendType.TORCH: - self.compressed_model.save_pretrained(self.output_model_dir) + self.model_hf.save_pretrained(self.output_model_dir) return @@ -248,10 +248,12 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32: + raise ValueError(f"{type(compressed_model_hf)}") + if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) + print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) similarity = all_metrics["similarity"][0] From 43aec31c78e4bf7e829b6e288a4de1abf2ed1e65 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:03:10 +0400 Subject: [PATCH 22/34] fix: Changed the reference metric value --- tests/post_training/data/wc_reference_data.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index 760d6ed5fea..ee3fe5db378 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -15,6 +15,6 @@ 
tinyllama_data_aware_awq_stateful_backend_OV: num_int4: 184 num_int8: 128 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.96283 + metric_value: 0.95944 num_int4: 228 num_int8: 84 From a85ded2bfe9aaf35158af8229d7c787ac9e2bac4 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Sun, 28 Apr 2024 13:04:54 +0400 Subject: [PATCH 23/34] refactor: Pre-Commit changes --- tests/post_training/pipelines/lm_weight_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 85407ba4d7e..b3d94f403f7 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -251,7 +251,7 @@ def _validate(self): raise ValueError(f"{type(compressed_model_hf)}") if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS ) print("Evaluation of the target model") From 28af5697cb166b34790ea878898b53d8da186d5a Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 10:58:59 +0400 Subject: [PATCH 24/34] fix: Returned the original checks for int4/int8 values I do not remember removing these --- .../pipelines/lm_weight_compression.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index b3d94f403f7..76a4d244d2b 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -259,3 +259,19 @@ def _validate(self): similarity = all_metrics["similarity"][0] self.run_info.metric_name = "Similarity" self.run_info.metric_value = round(similarity, 5) + + num_int4_reference = self.reference_data.get("num_int4") + num_int8_reference = self.reference_data.get("num_int8") + + num_int4_value = self.run_info.num_compress_nodes.num_int4 + num_int8_value = self.run_info.num_compress_nodes.num_int8 + + if num_int4_reference != num_int4_value: + status_msg = f"Regression: The number of int4 ops is different \ + than reference {num_int4_reference} != {num_int4_value}" + raise ValueError(status_msg) + + if num_int8_reference != num_int8_value: + status_msg = f"Regression: The number of int8 ops is different \ + than reference {num_int8_reference} != {num_int8_value}" + raise ValueError(status_msg) From a72ae7ee603b127c13ec636d87a9b4d949a6b58c Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 11:48:11 +0400 Subject: [PATCH 25/34] chore: Pre-Commit changes --- tests/post_training/pipelines/lm_weight_compression.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 76a4d244d2b..cfa81d19ce8 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -18,7 +18,6 @@ import numpy as np import openvino as ov -import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -200,7 +199,7 @@ def _dump_model_fp32(self) -> None: to the dedicated shared folder. 
""" self.model_hf.save_pretrained(self.fp32_model_dir) - if not self.backend == BackendType.TORCH: + if self.backend != BackendType.TORCH: self.model_hf._save_config(self.fp32_model_dir) def _compress(self): @@ -223,7 +222,6 @@ def _compress(self): ) def _validate(self): - is_stateful = self.params.get("is_stateful", False) core = ov.Core() if os.environ.get("INFERENCE_NUM_THREADS"): From 7d328c3253608242a68ad605a1393fc120124942 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Tue, 30 Apr 2024 20:53:13 +0400 Subject: [PATCH 26/34] refactor: Pre-Commit Changes --- tests/post_training/pipelines/lm_weight_compression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 13419ab66be..2318f3b5f86 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -158,7 +158,7 @@ def prepare_calibration_dataset(self): self.calibration_dataset = nncf.Dataset([inputs]) - return + return self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From 7e50cfa4a64aeb884f403421457856a3abf6efb1 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Wed, 1 May 2024 16:03:16 +0400 Subject: [PATCH 27/34] fix: Removed the debugging line --- tests/post_training/pipelines/lm_weight_compression.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 2318f3b5f86..784f7514a21 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -252,7 +252,6 @@ def _validate(self): ) compressed_model_hf = self.model_hf - raise ValueError(f"{type(compressed_model_hf)}") if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: compressed_model_hf = self.MODEL_FUNC.from_pretrained( self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS From 7c31d3d0212fdd7454629107fdca5069abd6e972 Mon Sep 17 00:00:00 2001 From: Adil Alizada <80326762+AdiKsOnDev@users.noreply.github.com> Date: Thu, 2 May 2024 14:57:06 +0400 Subject: [PATCH 28/34] fix: Corrected reference data for TORCH backend Co-authored-by: Alexander Suslov --- tests/post_training/data/wc_reference_data.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index b93d84d65dd..af744abe02f 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -19,6 +19,6 @@ tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV: num_int4: 188 num_int8: 124 tinyllama_int8_data_free_backend_TORCH: - metric_value: 0.95944 - num_int4: 228 - num_int8: 84 \ No newline at end of file + metric_value: 0.95624 + num_int4: 0 + num_int8: 312 \ No newline at end of file From 6899097e8d977fc942d9c31902b3b4cc978f9785 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:03:35 +0400 Subject: [PATCH 29/34] refactor: Code made cleaner Also deleted the following class attributes: MODEL_NAME MODEL_FUNC --- .../pipelines/lm_weight_compression.py | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py 
b/tests/post_training/pipelines/lm_weight_compression.py index 784f7514a21..eac1b493d8f 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,6 +16,7 @@ from dataclasses import dataclass from typing import Dict, Optional +import torch import numpy as np import openvino as ov from datasets import load_dataset @@ -69,36 +70,40 @@ def get_stats(self) -> Dict[str, str]: class LMWeightCompression(BaseTestPipeline): """Pipeline for casual language models from Hugging Face repository""" - MODEL_NAME = "openvino_model.xml" - MODEL_FUNC = OVModelForCausalLM + OV_MODEL_NAME = "openvino_model.xml" def prepare_model(self) -> None: is_stateful = self.params.get("is_stateful", False) + # load model if self.backend == BackendType.TORCH: - self.MODEL_NAME = "torch_model.xml" - self.MODEL_FUNC = AutoModelForCausalLM - self.MODEL_SPECIFIC_PARAMS = {} - else: - self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful} - - if is_stateful: - self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if is_stateful: + raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.") - if not (self.fp32_model_dir / self.MODEL_NAME).exists(): - # export by model_id - self.model_hf = self.MODEL_FUNC.from_pretrained( - self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + self.model_hf = AutoModelForCausalLM.from_pretrained( + self.model_id, torch_dtype=torch.float32, device_map="cpu" ) - - self._dump_model_fp32() + self.model = self.model_hf + elif self.backend == BackendType.OV: + if is_stateful: + self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf") + if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + # export by model_id + self.model_hf = OVModelForCausalLM.from_pretrained( + self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful + ) + else: + # no export, load from IR. Applicable for sequential run of test cases in local environment. + self.model_hf = OVModelForCausalLM.from_pretrained( + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful + ) + self.model = self.model_hf.model else: - # no export, load from IR. Applicable for sequential run of test cases in local environment. 
- self.model_hf = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS - ) + raise RuntimeError(f"backend={self.backend.value} is not supported.") - self.model = self.model_hf.model + # dump FP32 model + if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists(): + self._dump_model_fp32() def prepare_preprocessor(self) -> None: self.preprocessor = AutoTokenizer.from_pretrained(self.model_id) From 86e91f909549817205f08dbad3523c46dbb3a763 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:09:12 +0400 Subject: [PATCH 30/34] fix: Utilized wikitext for TORCH models as well Co-authored-by: Alexander Suslov --- .../pipelines/lm_weight_compression.py | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index eac1b493d8f..6a47f629194 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -16,9 +16,9 @@ from dataclasses import dataclass from typing import Dict, Optional -import torch import numpy as np import openvino as ov +import torch from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM @@ -126,29 +126,32 @@ def transform_fn(data, max_tokens=128): inputs["attention_mask"] = attention_mask position_ids = np.cumsum(attention_mask, axis=1) - 1 position_ids[attention_mask == 0] = 1 - - # The magic forms KV cache as model inputs - batch_size = input_ids.shape[0] - for input_name in self.model_hf.key_value_input_names: - model_inputs = self.model.input(input_name) - shape = model_inputs.get_partial_shape() - shape[0] = batch_size - if shape[2].is_dynamic: - shape[2] = 0 - else: - shape[1] = 0 - inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) - inputs["position_ids"] = position_ids - # initialize the rest of inputs (e.g. beam_idx for stateful models) - for val in self.model.inputs: - name = val.any_name - if name in inputs: - continue - shape = list(val.partial_shape.get_min_shape()) - shape[0] = batch_size - inputs[name] = np.zeros(shape) + if self.backend == BackendType.OV: + # The magic forms KV cache as model inputs + batch_size = input_ids.shape[0] + for input_name in self.model_hf.key_value_input_names: + model_inputs = self.model.input(input_name) + shape = model_inputs.get_partial_shape() + shape[0] = batch_size + if shape[2].is_dynamic: + shape[2] = 0 + else: + shape[1] = 0 + inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape()) + + # initialize the rest of inputs (e.g. beam_idx for stateful models) + for val in self.model.inputs: + name = val.any_name + if name in inputs: + continue + shape = list(val.partial_shape.get_min_shape()) + shape[0] = batch_size + inputs[name] = np.zeros(shape) + if self.backend == BackendType.TORCH: + for input_name in inputs: + inputs[input_name] = torch.from_numpy(inputs[input_name]) return inputs return transform_fn @@ -156,14 +159,7 @@ def transform_fn(data, max_tokens=128): def prepare_calibration_dataset(self): dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e") dataset = dataset.filter(lambda example: len(example["text"]) > 128) - if self.backend == BackendType.TORCH: - example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens." 
- token = self.preprocessor(example_text, max_length=500, return_tensors="pt", truncation=True) - inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]} - - self.calibration_dataset = nncf.Dataset([inputs]) - return self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn()) def cleanup_cache(self): From 7f32430235dc703ff72f95cda85b82154e3eb7da Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:14:45 +0400 Subject: [PATCH 31/34] feat: Implemented get_num_compressed Co-authored-by: Alexander Suslov --- .../pipelines/lm_weight_compression.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 6a47f629194..96b725849d0 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -22,6 +22,7 @@ from datasets import load_dataset from memory_profiler import memory_usage from optimum.intel.openvino import OVModelForCausalLM +from optimum.exporters.openvino.convert import export_from_model from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -186,16 +187,34 @@ def collect_data_from_stdout(self, stdout: str): def save_compressed_model(self) -> None: if self.backend == BackendType.FP32: return - if self.backend == BackendType.TORCH: - self.model_hf.save_pretrained(self.output_model_dir) - - return - ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME) - self.model_hf._save_config(self.output_model_dir) + if self.backend == BackendType.OV: + ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + self.model_hf._save_config(self.output_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") def get_num_compressed(self) -> None: - pass + """ + Get number of the i8, u8, i4, u4 ops in the compressed IR. + """ + num_int8 = 0 + num_int4 = 0 + + if self.backend == BackendType.TORCH: + model = ov.Core().read_model(self.output_model_dir / self.OV_MODEL_NAME) + else: + model = self.model + + for node in model.get_ops(): + for i in range(node.get_output_size()): + if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]: + num_int8 += 1 + if node.get_output_element_type(i).get_type_name() in ["i4", "u4"]: + num_int4 += 1 + + self.run_info.num_compress_nodes.num_int8 = num_int8 + self.run_info.num_compress_nodes.num_int4 = num_int4 def run_bench(self) -> None: pass From 7729867343e3b16f5284ea6686445dc378122e35 Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:15:52 +0400 Subject: [PATCH 32/34] fix: Dumping the fp32 model correctly Utilization of export_from_model() function from Optimum Co-authored-by: Alexander Suslov --- tests/post_training/pipelines/lm_weight_compression.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 96b725849d0..6e0c3e5138b 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -224,9 +224,11 @@ def _dump_model_fp32(self) -> None: Dump IRs of fp32 models, to help debugging. 
The test cases may share the same fp32 model, therefore it is saved to the dedicated shared folder. """ - self.model_hf.save_pretrained(self.fp32_model_dir) - if self.backend != BackendType.TORCH: + if self.backend == BackendType.OV: + self.model_hf.save_pretrained(self.fp32_model_dir) self.model_hf._save_config(self.fp32_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.fp32_model_dir, stateful=False, compression_option="fp32") def _compress(self): """ From 70cd9120132cdc24a7e413b87801fb56daf28cfa Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:19:48 +0400 Subject: [PATCH 33/34] chore: Removed unneccesary model wrapping TORCH Backends only --- .../pipelines/lm_weight_compression.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 6e0c3e5138b..51a96777c83 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -21,8 +21,8 @@ import torch from datasets import load_dataset from memory_profiler import memory_usage -from optimum.intel.openvino import OVModelForCausalLM from optimum.exporters.openvino.convert import export_from_model +from optimum.intel.openvino import OVModelForCausalLM from transformers import AutoModelForCausalLM from transformers import AutoTokenizer from whowhatbench import Evaluator @@ -189,10 +189,10 @@ def save_compressed_model(self) -> None: return if self.backend == BackendType.OV: - ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) - self.model_hf._save_config(self.output_model_dir) - elif self.backend == BackendType.TORCH: - export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") + ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME) + self.model_hf._save_config(self.output_model_dir) + elif self.backend == BackendType.TORCH: + export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32") def get_num_compressed(self) -> None: """ @@ -234,15 +234,6 @@ def _compress(self): """ Actual call of weight compression """ - if self.backend == BackendType.TORCH: - from nncf.torch.model_creation import is_wrapped_model - from nncf.torch.model_creation import wrap_model - - if not is_wrapped_model(self.model): - example_input = next(iter(self.calibration_dataset.get_inference_data())) - self.model = wrap_model(self.model, example_input=example_input, trace_parameters=True) - self.calibration_dataset = None - self.compressed_model = nncf.compress_weights( self.model, dataset=self.calibration_dataset, From e5db8cc42ae2cb1f2784a8ae7f4e8e4d07728e0a Mon Sep 17 00:00:00 2001 From: AdiKsOnDev Date: Thu, 2 May 2024 15:23:03 +0400 Subject: [PATCH 34/34] fix: Changed _validate to match the modified pipeline --- .../post_training/pipelines/lm_weight_compression.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py index 51a96777c83..fcab0a20f88 100644 --- a/tests/post_training/pipelines/lm_weight_compression.py +++ b/tests/post_training/pipelines/lm_weight_compression.py @@ -241,6 +241,7 @@ def _compress(self): ) def _validate(self): + is_stateful = self.params.get("is_stateful", False) core = ov.Core() if os.environ.get("INFERENCE_NUM_THREADS"): @@ 
-252,8 +253,8 @@ def _validate(self): gt_data_path.parent.mkdir(parents=True, exist_ok=True) if os.getenv("NNCF_TEST_REGEN_DOT") is not None: print("Collection ground-truth reference data") - model_gold = self.MODEL_FUNC.from_pretrained( - self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + model_gold = OVModelForCausalLM.from_pretrained( + self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",)) evaluator.dump_gt(str(gt_data_path)) @@ -265,11 +266,10 @@ def _validate(self): ) compressed_model_hf = self.model_hf - if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH: - compressed_model_hf = self.MODEL_FUNC.from_pretrained( - self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS + if self.backend != BackendType.FP32: + compressed_model_hf = OVModelForCausalLM.from_pretrained( + self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful ) - print("Evaluation of the target model") _, all_metrics = evaluator.score(compressed_model_hf) similarity = all_metrics["similarity"][0]
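
Note: for reference, the PyTorch weight-compression path that this series converges on (roughly patches 29-34) can be condensed into the standalone sketch below. It is an approximation of the final LMWeightCompression behaviour for BackendType.TORCH, not code taken from the repository: output_dir is a placeholder, the calibration transform is simplified from the patched get_transform_calibration_fn(), and only the stages touched by this series are shown.

# Condensed sketch of the final TORCH-backend flow (assumptions noted above).
import numpy as np
import torch

import nncf
import openvino as ov
from datasets import load_dataset
from nncf.parameters import CompressWeightsMode
from optimum.exporters.openvino.convert import export_from_model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model_id = "tinyllama/tinyllama-1.1b-step-50k-105b"
output_dir = "tinyllama_int8_data_free"  # placeholder output directory

# prepare_model(): plain Hugging Face model on CPU for the TORCH backend.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="cpu")


# prepare_calibration_dataset() / get_transform_calibration_fn(): wikitext samples,
# tokenized and handed to NNCF as torch tensors (simplified version of the patched code).
def transform_fn(data, max_tokens=128):
    tokens = tokenizer(data["text"], max_length=max_tokens, truncation=True, return_tensors="np")
    attention_mask = tokens["attention_mask"]
    position_ids = np.cumsum(attention_mask, axis=1) - 1
    position_ids[attention_mask == 0] = 1
    inputs = {
        "input_ids": tokens["input_ids"],
        "attention_mask": attention_mask,
        "position_ids": position_ids,
    }
    return {name: torch.from_numpy(value) for name, value in inputs.items()}


dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e")
dataset = dataset.filter(lambda example: len(example["text"]) > 128)
calibration_dataset = nncf.Dataset(dataset, transform_fn)

# _compress(): INT8 asymmetric weight compression; the pipeline keeps the result
# as self.compressed_model.
compressed_model = nncf.compress_weights(
    model, mode=CompressWeightsMode.INT8_ASYM, dataset=calibration_dataset
)

# save_compressed_model(): as in the patched pipeline, the Hugging Face model handle
# is exported to an OpenVINO IR after compression.
export_from_model(model, output_dir, stateful=False, compression_option="fp32")

# get_num_compressed(): count low-precision ops in the exported IR.
ir_model = ov.Core().read_model(f"{output_dir}/openvino_model.xml")
num_int8 = num_int4 = 0
for node in ir_model.get_ops():
    for i in range(node.get_output_size()):
        type_name = node.get_output_element_type(i).get_type_name()
        if type_name in ("i8", "u8"):
            num_int8 += 1
        if type_name in ("i4", "u4"):
            num_int4 += 1
print(f"num_int8={num_int8}, num_int4={num_int4}")

The sketch mirrors the pipeline stages touched by the series: prepare_model() loads the plain Hugging Face model on CPU, prepare_calibration_dataset() feeds tokenized wikitext samples as torch tensors, _compress() calls nncf.compress_weights with INT8_ASYM, save_compressed_model() exports an OpenVINO IR via export_from_model, and get_num_compressed() counts the i8/u8 and i4/u4 ops in that IR so they can be checked against wc_reference_data.yaml.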