diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md
index 8fabd6941cb..4f32e50ed1a 100644
--- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md
+++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/README.md
@@ -23,6 +23,8 @@ pip install bitsandbytes scipy
 source /opt/intel/oneapi/setvars.sh # necessary to run before installing deepspeed
 pip install git+https://github.com/microsoft/DeepSpeed.git@78c518e
 pip install git+https://github.com/intel/intel-extension-for-deepspeed.git@ec33277
+# (optional) install Open MPI, which provides the mpirun launcher used for multi-card finetuning
+sudo apt install openmpi-bin
 ```
 
 ### 2. Configures OneAPI environment variables
diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
index cfbdf3ed48c..4802f839942 100644
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -794,8 +794,12 @@ def forward(self, x: torch.Tensor):
                                                    self.weight.qtype, input_seq_size)
                     result = result.to(x.dtype)
                 else:
-                    result = xe_linear.forward_new(x_2d, self.weight.data,
-                                                   self.weight.qtype, input_seq_size)
+                    if self.weight.qtype == NF4:
+                        result = xe_linear.forward_new(x_2d, self.weight.data.view(torch.uint8),
+                                                       self.weight.qtype, input_seq_size)
+                    else:
+                        result = xe_linear.forward_new(x_2d, self.weight.data,
+                                                       self.weight.qtype, input_seq_size)
 
                 if do_empty_cache:
                     torch.xpu.empty_cache()