Update NPU example and all-in-one benchmark (#11766)
JinBridger authored Aug 12, 2024
1 parent 57d1777 commit 05989ad
Showing 3 changed files with 10 additions and 8 deletions.
7 changes: 4 additions & 3 deletions python/llm/dev/benchmark/all-in-one/run.py
```diff
@@ -580,15 +580,16 @@ def transformers_int4_npu_win(repo_id,
     # which convert the relevant layers in the model into INT4 format
     st = time.perf_counter()
     if repo_id in CHATGLM_IDS:
-        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True, torch_dtype='auto').eval()
+        model = AutoModel.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
+                                          torch_dtype='auto', attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     elif repo_id in LLAMA_IDS:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = LlamaTokenizer.from_pretrained(model_path, trust_remote_code=True)
     else:
         model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit=low_bit, trust_remote_code=True,
-                                                     use_cache=True).eval()
+                                                     use_cache=True, attn_implementation="eager").eval()
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     end = time.perf_counter()
     load_time = end - st
```
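The substantive change in this hunk is the new `attn_implementation="eager"` argument, which asks transformers to use the plain eager attention path instead of SDPA/flash-attention kernels when the model is run through the NPU benchmark. For reference, a minimal standalone sketch of the patched loading path is below; the import is an assumption, since the hunk does not show where `AutoModelForCausalLM` comes from, and `sym_int4` stands in for whatever low-bit format the benchmark config selects.

```python
# Minimal sketch of the patched loading path. Assumed import: the hunk
# does not show it; ipex_llm.transformers.npu_model is a guess.
import time

from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # HF repo id or local checkpoint
low_bit = "sym_int4"                          # example low-bit format

st = time.perf_counter()
# attn_implementation="eager" forces the plain attention implementation,
# mirroring the change in the hunk above.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_low_bit=low_bit,
                                             trust_remote_code=True,
                                             use_cache=True,
                                             attn_implementation="eager").eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print(f"load time: {time.perf_counter() - st:.2f} s")
```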
6 changes: 3 additions & 3 deletions in the NPU example's README
````diff
@@ -29,11 +29,11 @@ In the example [generate.py](./generate.py), we show a basic use case for a Llam
 #### 1.1 Installation on Windows
 We suggest using conda to manage environment:
 ```bash
-conda create -n llm python=3.10 libuv
+conda create -n llm python=3.10
 conda activate llm
 
-# below command will install intel_extension_for_pytorch==2.1.10+xpu as default
-pip install --pre --upgrade ipex-llm[xpu] --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+# install ipex-llm with 'all' option
+pip install --pre --upgrade ipex-llm[all]
 
 # below command will install intel_npu_acceleration_library
 pip install intel-npu-acceleration-library==1.3
````
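The README change drops `libuv` from the conda environment and swaps the `xpu` install option for the plain `all` option, presumably because the NPU path does not need the XPU build of intel_extension_for_pytorch. As a quick sanity check after following the updated instructions, something like the snippet below should import cleanly; the `__version__` attributes are assumptions and may not exist on every release, hence the `getattr` fallbacks.

```python
# Hypothetical post-install sanity check (not part of the README):
# confirm that both packages import in the freshly created env.
import ipex_llm
import intel_npu_acceleration_library as npu_lib

print("ipex-llm:", getattr(ipex_llm, "__version__", "version attribute not found"))
print("intel-npu-acceleration-library:",
      getattr(npu_lib, "__version__", "version attribute not found"))
```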
5 changes: 3 additions & 2 deletions in the NPU example's generate.py
```diff
@@ -24,7 +24,7 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for npu model')
-    parser.add_argument('--repo-id-or-model-path', type=str, default="D:\llm-models\Llama-2-7b-chat-hf",
+    parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
                         help='The huggingface repo id for the Llama2 model to be downloaded'
                              ', or the path to the huggingface checkpoint folder')
     parser.add_argument('--prompt', type=str, default="Once upon a time, there existed a little girl who liked to have adventures. She wanted to go to places and meet new people, and have fun",
@@ -40,7 +40,8 @@
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
     model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
-                                                 load_in_low_bit=args.load_in_low_bit)
+                                                 load_in_low_bit=args.load_in_low_bit,
+                                                 attn_implementation="eager")
 
     print(model)
```
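The two edits change the default model id from a Windows-local path to the Hugging Face repo id and, as in the benchmark, force eager attention. A hedged sketch of how the patched example presumably runs end to end is below; the `npu_model` import path and the `sym_int4` default are assumptions, while the tokenizer and `generate()` calls are standard transformers API.

```python
# Hedged end-to-end sketch of the patched example. Assumed import path
# and low-bit default; only the from_pretrained arguments appear in the diff.
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model_path = "meta-llama/Llama-2-7b-chat-hf"  # new default from the diff
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,
                                             load_in_low_bit="sym_int4",
                                             attn_implementation="eager")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

prompt = "Once upon a time, there existed a little girl who liked to have adventures."
with torch.inference_mode():
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```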
