From eeb21a039d22abce7b2b53a1f9b8a2008f5decfd Mon Sep 17 00:00:00 2001
From: Nikolay
Date: Tue, 6 Aug 2024 17:54:35 +0200
Subject: [PATCH] New metrics for weight compression with dynamic quantization

---
 .../llm_compression/openvino/tiny_llama/main.py  |  2 +-
 .../openvino/tiny_llama_find_hyperparams/main.py |  1 -
 tests/post_training/data/wc_reference_data.yaml  | 16 ++++++++--------
 .../pipelines/lm_weight_compression.py           |  1 -
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/examples/llm_compression/openvino/tiny_llama/main.py b/examples/llm_compression/openvino/tiny_llama/main.py
index dd03a4361c6..f2be54ce1aa 100644
--- a/examples/llm_compression/openvino/tiny_llama/main.py
+++ b/examples/llm_compression/openvino/tiny_llama/main.py
@@ -67,7 +67,7 @@ def transform_fn(data, model, tokenizer):
     )
     model.save_pretrained(OUTPUT_DIR)
 
-    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
+    model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR)
 
     input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)
     start_t = time.time()
diff --git a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py
index 7ab0176eb85..6b57b9481f2 100644
--- a/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py
+++ b/examples/llm_compression/openvino/tiny_llama_find_hyperparams/main.py
@@ -245,7 +245,6 @@ def main():
         "PERFORMANCE_HINT": "LATENCY",
         "NUM_STREAMS": "1",
         "CACHE_DIR": "",
-        "DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
     }
     model = OVModelForCausalLM.from_pretrained(
         model_id,
diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml
index 08a5b426746..d1d988e2a88 100644
--- a/tests/post_training/data/wc_reference_data.yaml
+++ b/tests/post_training/data/wc_reference_data.yaml
@@ -1,32 +1,32 @@
 tinyllama_data_free_backend_OV:
-  metric_value: 0.73873
+  metric_value: 0.72494
   num_int4: 114
   num_int8: 84
 tinyllama_data_aware_backend_OV:
-  metric_value: 0.85767
+  metric_value: 0.85635
   num_int4: 94
   num_int8: 124
 tinyllama_data_aware_awq_stateful_backend_OV:
-  metric_value: 0.85616
+  metric_value: 0.84951
   num_int4: 94
   num_int8: 124
 tinyllama_data_aware_awq_scale_estimation_backend_OV:
-  metric_value: 0.85502
+  metric_value: 0.85299
   num_int4: 94
   num_int8: 124
 tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV:
-  metric_value: 0.85502
+  metric_value: 0.84125
   num_int4: 94
   num_int8: 124
 tinyllama_int8_data_free_backend_TORCH:
-  metric_value: 0.95624
+  metric_value: 0.94924
   num_int4: 0
   num_int8: 312
 tinyllama_data_aware_gptq_backend_OV:
-  metric_value: 0.87134
+  metric_value: 0.86621
   num_int4: 94
   num_int8: 124
 tinyllama_scale_estimation_per_channel_backend_OV:
-  metric_value: 0.81389
+  metric_value: 0.82356
   num_int4: 188
   num_int8: 124
diff --git a/tests/post_training/pipelines/lm_weight_compression.py b/tests/post_training/pipelines/lm_weight_compression.py
index 27479fe6a50..06074701b42 100644
--- a/tests/post_training/pipelines/lm_weight_compression.py
+++ b/tests/post_training/pipelines/lm_weight_compression.py
@@ -275,7 +275,6 @@ def _validate(self):
             load_in_8bit=False,
             compile=False,
             stateful=is_stateful,
-            ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
         )
         print("Evaluation of the target model")
         _, all_metrics = evaluator.score(compressed_model_hf)