[NNCF]: Add INT8 weight compression conformance test for Tinyllama-1.1b PyTorch model (#2636)

### Changes

- Added the `INT8` compression **test suite** to the `model_scope`
- Added `TORCH` backend support to the `LMWeightCompression` class
- For `INT8` compression, the _dataset_, as well as some other parameters
(see
[model_scope](https://github.com/openvinotoolkit/nncf/blob/f0081037f28af2a829043d4ddaf4902d91864724/tests/post_training/model_scope.py#L329C1-L340C7)),
are set to `None` (the `INT8` path is data-free; see the sketch after this list)
- [metric_value](https://github.com/openvinotoolkit/nncf/blob/f0081037f28af2a829043d4ddaf4902d91864724/tests/post_training/data/wc_reference_data.yaml#L17C1-L20C15) has been set to **0.95944**
- Mainly used `save_pretrained()` for `TORCH` models
- Omitted a few method calls that are not supported for `TORCH` models
(Check the commits for details)
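
A minimal sketch (illustrative, not the test suite's code) of the data-free path the new test case exercises, assuming the public `nncf.compress_weights` API. `INT8_ASYM` weight compression requires no calibration data, which is why `dataset` and the related parameters in `model_scope` can stay `None`:

```python
import nncf
import torch
from transformers import AutoModelForCausalLM

# Load the FP32 PyTorch model on CPU, as the TORCH backend of LMWeightCompression does.
model = AutoModelForCausalLM.from_pretrained(
    "tinyllama/tinyllama-1.1b-step-50k-105b", torch_dtype=torch.float32, device_map="cpu"
)

# Data-free INT8 weight compression: no calibration nncf.Dataset is passed.
compressed_model = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_ASYM)
```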

 

### Reason for changes

Benchmarking of the changes via `whowhatbench` was requested in issue #2527.

### Related tickets

ref: 130788
Closes #2527

### Tests
- Added an `INT8` _weight compression_ **conformance** test for the `Tinyllama-1.1b` **PyTorch** model
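
For reference, a hypothetical sketch of what the conformance check amounts to against the new `wc_reference_data.yaml` entry (the reference numbers come from the diff below; the tolerance and function are illustrative, not the suite's actual code):

```python
# Reference values from the wc_reference_data.yaml entry added in this commit.
REFERENCE = {"metric_value": 0.95624, "num_int4": 0, "num_int8": 312}
ATOL = 0.001  # assumed tolerance for the whowhatbench similarity metric


def check_run(measured_similarity: float, num_int4: int, num_int8: int) -> None:
    # The similarity score must stay close to the reference value, and the
    # counts of INT4/INT8 weight tensors must match exactly.
    assert abs(measured_similarity - REFERENCE["metric_value"]) <= ATOL
    assert num_int4 == REFERENCE["num_int4"]
    assert num_int8 == REFERENCE["num_int8"]
```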

---------

Co-authored-by: Aleksander <[email protected]>
Co-authored-by: Alexander Suslov <[email protected]>
3 people authored May 2, 2024
1 parent 08d5f0c commit ba7e1a4
Showing 3 changed files with 86 additions and 38 deletions.
4 changes: 4 additions & 0 deletions tests/post_training/data/wc_reference_data.yaml
@@ -18,3 +18,7 @@ tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV:
  metric_value: 0.83795
  num_int4: 188
  num_int8: 124
tinyllama_int8_data_free_backend_TORCH:
  metric_value: 0.95624
  num_int4: 0
  num_int8: 312
9 changes: 9 additions & 0 deletions tests/post_training/model_scope.py
@@ -370,6 +370,15 @@
        "params": {"is_stateful": True},
        "backends": [BackendType.OV],
    },
    {
        "reported_name": "tinyllama_int8_data_free",
        "model_id": "tinyllama/tinyllama-1.1b-step-50k-105b",
        "pipeline_cls": LMWeightCompression,
        "compression_params": {
            "mode": CompressWeightsMode.INT8_ASYM,
        },
        "backends": [BackendType.TORCH],
    },
]


111 changes: 73 additions & 38 deletions tests/post_training/pipelines/lm_weight_compression.py
@@ -18,9 +18,12 @@

import numpy as np
import openvino as ov
import torch
from datasets import load_dataset
from memory_profiler import memory_usage
from optimum.exporters.openvino.convert import export_from_model
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from whowhatbench import Evaluator

@@ -72,20 +75,36 @@ class LMWeightCompression(BaseTestPipeline):

def prepare_model(self) -> None:
is_stateful = self.params.get("is_stateful", False)
if is_stateful:
self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf")
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
# export by model_id
self.model_hf = OVModelForCausalLM.from_pretrained(
self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful

# load model
if self.backend == BackendType.TORCH:
if is_stateful:
raise RuntimeError(f"is_stateful={is_stateful} is not supported for PyTorch backend.")

self.model_hf = AutoModelForCausalLM.from_pretrained(
self.model_id, torch_dtype=torch.float32, device_map="cpu"
)
self._dump_model_fp32()
self.model = self.model_hf
elif self.backend == BackendType.OV:
if is_stateful:
self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf")
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
# export by model_id
self.model_hf = OVModelForCausalLM.from_pretrained(
self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
else:
# no export, load from IR. Applicable for sequential run of test cases in local environment.
self.model_hf = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
self.model = self.model_hf.model
else:
# no export, load from IR. Applicable for sequential run of test cases in local environment.
self.model_hf = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
)
self.model = self.model_hf.model
raise RuntimeError(f"backend={self.backend.value} is not supported.")

# dump FP32 model
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():
self._dump_model_fp32()

def prepare_preprocessor(self) -> None:
self.preprocessor = AutoTokenizer.from_pretrained(self.model_id)
@@ -108,36 +127,40 @@ def transform_fn(data, max_tokens=128):
inputs["attention_mask"] = attention_mask
position_ids = np.cumsum(attention_mask, axis=1) - 1
position_ids[attention_mask == 0] = 1

# The magic forms KV cache as model inputs
batch_size = input_ids.shape[0]
for input_name in self.model_hf.key_value_input_names:
model_inputs = self.model.input(input_name)
shape = model_inputs.get_partial_shape()
shape[0] = batch_size
if shape[2].is_dynamic:
shape[2] = 0
else:
shape[1] = 0
inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

inputs["position_ids"] = position_ids

# initialize the rest of inputs (e.g. beam_idx for stateful models)
for val in self.model.inputs:
name = val.any_name
if name in inputs:
continue
shape = list(val.partial_shape.get_min_shape())
shape[0] = batch_size
inputs[name] = np.zeros(shape)
if self.backend == BackendType.OV:
# The magic forms KV cache as model inputs
batch_size = input_ids.shape[0]
for input_name in self.model_hf.key_value_input_names:
model_inputs = self.model.input(input_name)
shape = model_inputs.get_partial_shape()
shape[0] = batch_size
if shape[2].is_dynamic:
shape[2] = 0
else:
shape[1] = 0
inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())

# initialize the rest of inputs (e.g. beam_idx for stateful models)
for val in self.model.inputs:
name = val.any_name
if name in inputs:
continue
shape = list(val.partial_shape.get_min_shape())
shape[0] = batch_size
inputs[name] = np.zeros(shape)
if self.backend == BackendType.TORCH:
for input_name in inputs:
inputs[input_name] = torch.from_numpy(inputs[input_name])
return inputs

return transform_fn

def prepare_calibration_dataset(self):
dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e")
dataset = dataset.filter(lambda example: len(example["text"]) > 128)

self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn())

def cleanup_cache(self):
@@ -164,8 +187,12 @@ def collect_data_from_stdout(self, stdout: str):
def save_compressed_model(self) -> None:
if self.backend == BackendType.FP32:
return
ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)

if self.backend == BackendType.OV:
ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)
elif self.backend == BackendType.TORCH:
export_from_model(self.model_hf, self.output_model_dir, stateful=False, compression_option="fp32")

def get_num_compressed(self) -> None:
"""
@@ -174,7 +201,12 @@ def get_num_compressed(self) -> None:
num_int8 = 0
num_int4 = 0

for node in self.model.get_ops():
if self.backend == BackendType.TORCH:
model = ov.Core().read_model(self.output_model_dir / self.OV_MODEL_NAME)
else:
model = self.model

for node in model.get_ops():
for i in range(node.get_output_size()):
if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]:
num_int8 += 1
@@ -192,8 +224,11 @@ def _dump_model_fp32(self) -> None:
Dump IRs of fp32 models, to help debugging. The test cases may share the same fp32 model, therefore it is saved
to the dedicated shared folder.
"""
self.model_hf.save_pretrained(self.fp32_model_dir)
self.model_hf._save_config(self.fp32_model_dir)
if self.backend == BackendType.OV:
self.model_hf.save_pretrained(self.fp32_model_dir)
self.model_hf._save_config(self.fp32_model_dir)
elif self.backend == BackendType.TORCH:
export_from_model(self.model_hf, self.fp32_model_dir, stateful=False, compression_option="fp32")

def _compress(self):
"""
