diff --git a/python/llm/dev/benchmark/ceval/README.md b/python/llm/dev/benchmark/ceval/README.md index 03e835674d1..97771ba5ed3 100644 --- a/python/llm/dev/benchmark/ceval/README.md +++ b/python/llm/dev/benchmark/ceval/README.md @@ -18,6 +18,7 @@ bash run.sh ``` + `run.sh` ```shell +export IPEX_LLM_LAST_LM_HEAD=0 python eval.py \ --model_path "path to model" \ --eval_type validation \ diff --git a/python/llm/dev/benchmark/ceval/run.sh b/python/llm/dev/benchmark/ceval/run.sh index 1a4b92ef934..19a4b457fe6 100644 --- a/python/llm/dev/benchmark/ceval/run.sh +++ b/python/llm/dev/benchmark/ceval/run.sh @@ -1,3 +1,5 @@ +export IPEX_LLM_LAST_LM_HEAD=0 + python eval.py \ --model_path "path to model" \ --eval_type validation \ diff --git a/python/llm/dev/benchmark/harness/README.md b/python/llm/dev/benchmark/harness/README.md index 50ec4b86f30..8f0d775cd85 100644 --- a/python/llm/dev/benchmark/harness/README.md +++ b/python/llm/dev/benchmark/harness/README.md @@ -15,15 +15,21 @@ pip install -e . run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py). 
### Evaluation on CPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` ### Evaluation on Intel GPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` ### Evaluation using multiple Intel GPU -```python +```bash +export IPEX_LLM_LAST_LM_HEAD=0 + python run_multi_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu:0,2,3 --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache ``` Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks. diff --git a/python/llm/dev/benchmark/perplexity/README.md b/python/llm/dev/benchmark/perplexity/README.md index 410358eed34..638bdab335b 100644 --- a/python/llm/dev/benchmark/perplexity/README.md +++ b/python/llm/dev/benchmark/perplexity/README.md @@ -12,6 +12,11 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this source /opt/intel/oneapi/setvars.sh ``` +Please set IPEX_LLM_LAST_LM_HEAD=0 to disable the last_lm_head optimization. +```bash +export IPEX_LLM_LAST_LM_HEAD=0 +``` + ## PPL Evaluation ### 1. 
Run on Wikitext An example to run perplexity on [wikitext](https://paperswithcode.com/dataset/wikitext-2): diff --git a/python/llm/dev/benchmark/whisper/README.md b/python/llm/dev/benchmark/whisper/README.md index 189435db9c2..d2e6ed84f75 100644 --- a/python/llm/dev/benchmark/whisper/README.md +++ b/python/llm/dev/benchmark/whisper/README.md @@ -10,6 +10,7 @@ pip install datasets evaluate soundfile librosa jiwer ## Run ```bash +export IPEX_LLM_LAST_LM_HEAD=0 python run_whisper.py --model_path /path/to/model --data_type other --device cpu ``` diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index a7da4efc518..4f44b058e37 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -403,7 +403,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, optimize_lm_head = ( is_lm_head(name, model_config, out_features) and ( -                os.environ.get("IPEX_LLM_LAST_LM_HEAD", "0") == "1" +                os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) != "0" or os.environ.get("IPEX_LLM_LOW_MEM", "0") == "1" and getattr(model_config, "model_type", "") in ["gptj", "llama", "qwen2"] )