Skip to content

Commit

Permalink
Enabled dynamic quantization (by removing DYNAMIC_QUANTIZATION_GROUP_SIZE=0) and set KV cache precision to u8
Browse files Browse the repository at this point in the history
  • Loading branch information
ljaljushkin committed Nov 13, 2024
1 parent 8555c86 commit c7f3566
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion examples/llm_compression/openvino/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def transform_fn(data, model, tokenizer):
)
model.save_pretrained(OUTPUT_DIR)

model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"KV_CACHE_PRECISION": "u8"})
input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)

start_t = time.time()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ def main():
ov_config = {
"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1",
"KV_CACHE_PRECISION": "u8",
"CACHE_DIR": "",
"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
}
model = OVModelForCausalLM.from_pretrained(
model_id,
Expand Down
2 changes: 1 addition & 1 deletion tests/post_training/pipelines/lm_weight_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def _validate(self):
load_in_8bit=False,
compile=False,
stateful=is_stateful,
ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
ov_config={"KV_CACHE_PRECISION": "u8"},
)
print("Evaluation of the target model")
_, all_metrics = evaluator.score(compressed_model_hf)
Expand Down

0 comments on commit c7f3566

Please sign in to comment.