From 107f7aafd0f7634aa14e59d4e9a1a5709cd23e2a Mon Sep 17 00:00:00 2001
From: Shaojun Liu <61072813+liu-shaojun@users.noreply.github.com>
Date: Thu, 8 Aug 2024 14:38:30 +0800
Subject: [PATCH] enable inference mode for deepspeed tp serving (#11742)

---
 python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py
index 23de5fa1acc..3d8d4ca9ea3 100644
--- a/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py
+++ b/python/llm/example/GPU/Deepspeed-AutoTP-FastAPI/serving.py
@@ -116,11 +116,13 @@ def load_model(model_path, low_bit):
     # Use IPEX-LLM `optimize_model` to convert the model into optimized low bit format
     # Convert the rest of the model into float16 to reduce allreduce traffic
     model = optimize_model(model.module.to(f"cpu"), low_bit=low_bit).to(torch.float16)
-    
+
     # Next, use XPU as accelerator to speed up inference
     current_accel = XPU_Accelerator()
     set_accelerator(current_accel)
 
+    model=model.eval()
+
     # Move model back to xpu
     model = model.to(f"xpu:{local_rank}")
     model = BenchmarkWrapper(model)
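
Why this one-liner matters: `model.eval()` puts the module into inference mode, so layers such as dropout and batch normalization stop using their training-time behavior before the model serves requests. The snippet below is a standalone illustration of that effect, not part of serving.py; the module and tensor names are made up for the demo, and the optional `torch.inference_mode()` context is a separate PyTorch mechanism that this patch does not add.

import torch
import torch.nn as nn

# Illustrative toy module (unrelated to the Deepspeed-AutoTP serving model):
# dropout behaves differently in train vs. eval mode.
net = nn.Sequential(nn.Linear(8, 8), nn.Dropout(p=0.5))
x = torch.ones(1, 8)

net.train()                   # training mode (the default): dropout zeroes activations at random
print(net(x))                 # output changes from call to call

net = net.eval()              # same call the patch adds: switch to inference (eval) mode
with torch.inference_mode():  # optional extra: also skip autograd bookkeeping (not in the patch)
    print(net(x))             # dropout is now a no-op, so the output is deterministic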