diff --git a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
index 3bfc6e0aede..73e977c9559 100644
--- a/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/question_answering/quantization/ptq_static/main.py
@@ -483,6 +483,8 @@ def eval_func(model, *args):
         if model_args.model_name_or_path == 'mrm8488/spanbert-finetuned-squadv1':
             fp32_op_names = ['/bert/embeddings/word_embeddings/Gather',
                              '/bert/encoder/layer.[5-7|9]/output/dense/MatMul']
+        elif model_args.model_name_or_path == 'salti/bert-base-multilingual-cased-finetuned-squad':
+            fp32_op_names = ['/bert/encoder/layer.[4-5]/output/dense/MatMul']
         elif model_args.model_name_or_path == 'distilbert-base-uncased-distilled-squad':
             fp32_op_names = ['/distilbert/transformer/layer.[1-5]/ffn/lin[1-2]/MatMul']
         elif model_args.model_name_or_path == 'deepset/roberta-large-squad2':
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
index c1095c822bd..46b09a25f86 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/ptq_static/main.py
@@ -25,7 +25,7 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.evaluation.lm_eval import evaluate
+from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer
 
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
index 2a278f2a9b4..fa1a250ef1c 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/README.md
@@ -12,6 +12,8 @@ pip install -r requirements.txt
 ```
 > Note: Validated ONNX Runtime [Version](/docs/source/installation_guide.md#validated-software-environment).
 
+> Note: Weight-only quantization in Intel® Neural Compressor is still under development. We encourage you to use the `master` branch to access the latest features.
+
 ## 2. Prepare Model
 Note that this README.md uses meta-llama/Llama-2-7b-hf as an example. There are other models available that can be used for weight-only quantization.
 The following table shows a few models' configurations:
diff --git a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
index 233e19b7201..1a98933f597 100644
--- a/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
+++ b/examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py
@@ -26,7 +26,7 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.evaluation.lm_eval import evaluate
+from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer
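
For context on what the `fp32_op_names` patterns in the first hunk control: the matched ops are excluded from static quantization and kept in FP32 to recover accuracy. A minimal sketch of that wiring, assuming the Neural Compressor 2.x `PostTrainingQuantConfig` API with an `op_name_dict` fallback; the exact plumbing in the example's `main.py` may differ:

```python
from neural_compressor import PostTrainingQuantConfig, quantization

# Regex-style op names to pin to FP32; '[4-5]' matches encoder layers 4 and 5
# (pattern from the salti/bert-base-multilingual-cased-finetuned-squad branch above).
fp32_op_names = ['/bert/encoder/layer.[4-5]/output/dense/MatMul']

# Map each pattern to an FP32 override so the quantizer leaves those MatMuls alone.
config = PostTrainingQuantConfig(
    approach='static',
    op_name_dict={name: {'weight': {'dtype': ['fp32']},
                         'activation': {'dtype': ['fp32']}}
                  for name in fp32_op_names},
)

# Hypothetical call: calib_dataloader and eval_func come from the example's setup.
# q_model = quantization.fit(model, config, calib_dataloader=..., eval_func=eval_func)
```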