Skip to content

Commit

Permalink
Enabled dynamic quantization (by removing DYNAMIC_QUANTIZATION_GROUP_SIZE=0) and set KV cache precision to u8
Browse files Browse the repository at this point in the history
  • Loading branch information
ljaljushkin committed Nov 13, 2024
1 parent 8555c86 commit c7f3566
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion examples/llm_compression/openvino/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def transform_fn(data, model, tokenizer):
)
model.save_pretrained(OUTPUT_DIR)

model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"})
model = OVModelForCausalLM.from_pretrained(OUTPUT_DIR, ov_config={"KV_CACHE_PRECISION": "u8"})
input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=model.device)

start_t = time.time()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ def main():
ov_config = {
"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1",
"KV_CACHE_PRECISION": "u8",
"CACHE_DIR": "",
"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
}
model = OVModelForCausalLM.from_pretrained(
model_id,
Expand Down
2 changes: 1 addition & 1 deletion tests/post_training/pipelines/lm_weight_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def _validate(self):
load_in_8bit=False,
compile=False,
stateful=is_stateful,
ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"},
ov_config={"KV_CACHE_PRECISION": "u8"},
)
print("Evaluation of the target model")
_, all_metrics = evaluator.score(compressed_model_hf)
Expand Down

0 comments on commit c7f3566

Please sign in to comment.