Set IPEX_LLM_LAST_LM_HEAD=1 as default #11885

Merged 1 commit on Aug 21, 2024
1 change: 1 addition & 0 deletions in python/llm/dev/benchmark/ceval/README.md

````diff
@@ -18,6 +18,7 @@ bash run.sh
 ```
 + `run.sh`
 ```shell
+export IPEX_LLM_LAST_LM_HEAD=0
 python eval.py \
     --model_path "path to model" \
     --eval_type validation \
````
2 changes: 2 additions & 0 deletions in python/llm/dev/benchmark/ceval/run.sh

````diff
@@ -1,3 +1,5 @@
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python eval.py \
     --model_path "path to model" \
     --eval_type validation \
````
12 changes: 9 additions & 3 deletions in python/llm/dev/benchmark/harness/README.md

````diff
@@ -15,15 +15,21 @@ pip install -e .
 run `python run_llb.py`. `run_llb.py` combines some arguments in `main.py` to make evaluations easier. The mapping of arguments is defined as a dict in [`llb.py`](llb.py).
 
 ### Evaluation on CPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device cpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ### Evaluation on Intel GPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 ### Evaluation using multiple Intel GPU
-```python
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+
 python run_multi_llb.py --model ipex-llm --pretrained /path/to/model --precision nf3 sym_int4 nf4 --device xpu:0,2,3 --tasks hellaswag arc mmlu truthfulqa --batch 1 --no_cache
 ```
 Taking example above, the script will fork 3 processes, each for one xpu, to execute the tasks.
````
5 changes: 5 additions & 0 deletions in python/llm/dev/benchmark/perplexity/README.md

````diff
@@ -12,6 +12,11 @@ This is a required step on Linux for APT or offline installed oneAPI. Skip this
 source /opt/intel/oneapi/setvars.sh
 ```
 
+Please set IPEX_LLM_LAST_LM_HEAD=0 to disable the last_lm_head optimization.
+```bash
+export IPEX_LLM_LAST_LM_HEAD=0
+```
+
 ## PPL Evaluation
 ### 1. Run on Wikitext
 An example to run perplexity on [wikitext](https://paperswithcode.com/dataset/wikitext-2):
````
1 change: 1 addition & 0 deletions in python/llm/dev/benchmark/whisper/README.md

````diff
@@ -10,6 +10,7 @@ pip install datasets evaluate soundfile librosa jiwer
 
 ## Run
 ```bash
+export IPEX_LLM_LAST_LM_HEAD=0
 python run_whisper.py --model_path /path/to/model --data_type other --device cpu
 ```
 
````
2 changes: 1 addition & 1 deletion in python/llm/src/ipex_llm/transformers/convert.py

````diff
@@ -403,7 +403,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
     optimize_lm_head = (
         is_lm_head(name, model_config, out_features)
         and (
-            os.environ.get("IPEX_LLM_LAST_LM_HEAD", "0") == "1"
+            (not os.environ.get("IPEX_LLM_LAST_LM_HEAD", None) == "0")
             or os.environ.get("IPEX_LLM_LOW_MEM", "0") == "1"
             and getattr(model_config, "model_type", "") in ["gptj", "llama", "qwen2"]
         )
````
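This one-line change is what flips the default: previously the last-LM-head optimization was opt-in (applied only when `IPEX_LLM_LAST_LM_HEAD` was explicitly `"1"`), whereas now it is opt-out (applied unless the variable is explicitly `"0"`), which is why the benchmark scripts above now export `0` to keep their previous behavior. A minimal sketch of the new predicate, using an illustrative helper name that is not part of the library:

```python
def last_lm_head_enabled(env: dict) -> bool:
    # Mirrors the new gate in convert.py: `not x == "0"` parses as
    # `not (x == "0")`, so any value other than the literal string "0"
    # (including an unset variable) enables the optimization.
    return not env.get("IPEX_LLM_LAST_LM_HEAD", None) == "0"

# Old default: disabled unless explicitly set to "1".
# New default: enabled unless explicitly set to "0".
assert last_lm_head_enabled({}) is True                              # unset: now on
assert last_lm_head_enabled({"IPEX_LLM_LAST_LM_HEAD": "1"}) is True  # opt-in: still on
assert last_lm_head_enabled({"IPEX_LLM_LAST_LM_HEAD": "0"}) is False # opt-out: off
```

Note that in the surrounding expression `A or B and C`, Python's `and` binds tighter than `or`, so the `model_type` check constrains only the `IPEX_LLM_LOW_MEM` branch, not the new default path.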