diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md index 8fabd6941cb..4f32e50ed1a 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md @@ -23,6 +23,8 @@ pip install bitsandbytes scipy source /opt/intel/oneapi/setvars.sh # necessary to run before installing deepspeed pip install git+https://github.com/microsoft/DeepSpeed.git@78c518e pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@ec33277 +# (optional) install mpirun to run multi-card finetuning +sudo apt install openmpi-bin ``` ### 2. Configures OneAPI environment variables diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index cfbdf3ed48c..4802f839942 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -794,8 +794,12 @@ def forward(self, x: torch.Tensor): self.weight.qtype, input_seq_size) result = result.to(x.dtype) else: - result = xe_linear.forward_new(x_2d, self.weight.data, - self.weight.qtype, input_seq_size) + if self.weight.qtype == NF4: + result = xe_linear.forward_new(x_2d, self.weight.data.view(torch.uint8), + self.weight.qtype, input_seq_size) + else: + result = xe_linear.forward_new(x_2d, self.weight.data, + self.weight.qtype, input_seq_size) if do_empty_cache: torch.xpu.empty_cache()