From d817328dd9040f3896bed71c1bb1c181a3cc805c Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Thu, 19 Oct 2023 14:36:04 +0800
Subject: [PATCH] Update ITREX version in ONNXRT WOQ example and fix bugs in
 hf models (#1333)

Signed-off-by: yuwenzho
---
 .../question_answering/quantization/ptq_static/main.py       | 2 ++
 .../text_generation/llama/quantization/ptq_static/main.py    | 2 +-
 .../llama/quantization/ptq_static/requirements.txt           | 2 +-
 .../text_generation/llama/quantization/weight_only/README.md | 2 ++
 .../text_generation/llama/quantization/weight_only/main.py   | 2 +-
 .../llama/quantization/weight_only/requirements.txt          | 4 ++--
 6 files changed, 9 insertions(+), 5 deletions(-)
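Reviewer note (placed below the ---, so it stays out of the commit message):
recent intel-extension-for-transformers releases moved the lm-eval entry
point from intel_extension_for_transformers.evaluation.lm_eval to
intel_extension_for_transformers.llm.evaluation.lm_eval; the main.py hunks
below update the examples accordingly. A minimal usage sketch of the updated
import, assuming the PyPI package from requirements.txt (the argument values
are illustrative placeholders, not taken from this patch):

    from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate

    # Illustrative call: evaluate an exported ONNX causal LM on one task.
    # The model path, tokenizer name, and task list are placeholders.
    results = evaluate(
        model="hf-causal",
        model_args="pretrained=./llama-2-7b-onnx,tokenizer=meta-llama/Llama-2-7b-hf",
        tasks=["lambada_openai"],
        model_format="onnx",
    )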
diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
index 3bfc6e0aede..73e977c9559 100644
--- a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
@@ -483,6 +483,8 @@ def eval_func(model, *args):
         if model_args.model_name_or_path == 'mrm8488/spanbert-finetuned-squadv1':
             fp32_op_names = ['/bert/embeddings/word_embeddings/Gather',
                              '/bert/encoder/layer.[5-7|9]/output/dense/MatMul']
+        elif model_args.model_name_or_path == 'salti/bert-base-multilingual-cased-finetuned-squad':
+            fp32_op_names = ['/bert/encoder/layer.[4-5]/output/dense/MatMul']
         elif model_args.model_name_or_path == 'distilbert-base-uncased-distilled-squad':
             fp32_op_names = ['/distilbert/transformer/layer.[1-5]/ffn/lin[1-2]/MatMul']
         elif model_args.model_name_or_path == 'deepset/roberta-large-squad2':
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
index 1cf19b1873c..03a6e632ca8 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -25,7 +25,7 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.evaluation.lm_eval import evaluate
+from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer
 
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
index 1061d4e7c8e..216b355bd2b 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/requirements.txt
@@ -1,5 +1,5 @@
-git+https://github.com/intel/intel-extension-for-transformers.git@b8302f99a93e5f09a80431cee2fb384755062664
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
+intel-extension-for-transformers
 torch
 transformers
 accelerate
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
index f780509991a..e15d0e3c703 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
@@ -12,6 +12,8 @@ pip install -r requirements.txt
 ```
 
 > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
 
+> Note: Weight-only quantization in Intel® Neural Compressor is still under development. We encourage you to use the `master` branch to access the latest features.
+
 ## 2. Prepare Model
 ```bash
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index a4bdba8bc38..b4985163fde 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -26,7 +26,7 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.evaluation.lm_eval import evaluate
+from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer
 
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
index 1061d4e7c8e..6b7ed0b86af 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/requirements.txt
@@ -1,5 +1,5 @@
-git+https://github.com/intel/intel-extension-for-transformers.git@b8302f99a93e5f09a80431cee2fb384755062664
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@83dbfbf6070324f3e5872f63e49d49ff7ef4c9b3
+intel-extension-for-transformers
 torch
 transformers
 accelerate
@@ -8,4 +8,4 @@ onnxruntime
 onnxruntime-extensions; python_version < '3.11'
 datasets
 optimum
-evaluate
+evaluate
\ No newline at end of file
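
Reviewer note: with the pinned ITREX commit replaced by the released
intel-extension-for-transformers package, a quick import check confirms that
the installed version exposes the renamed module path. A minimal smoke-test
sketch (the module and attribute names are taken from the hunks above):

    # Fails on older ITREX releases that still ship the pre-rename
    # intel_extension_for_transformers.evaluation layout.
    import importlib

    lm_eval = importlib.import_module(
        "intel_extension_for_transformers.llm.evaluation.lm_eval")
    assert hasattr(lm_eval, "evaluate")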