diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 2a3ecffcda6..0eec9919909 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -151,7 +151,9 @@ def from_pretrained(cls,
     @classmethod
     def load_convert(cls, q_k, optimize_model, device, *arg, **kwarg):
         from ipex_llm.transformers.npu_models.convert import replace_with_QuantizedLinear
+        from ipex_llm.transformers.npu_models.convert import replace_with_LowBitMLP
         replace_with_QuantizedLinear(optimize_model, q_k, device=device)
+        replace_with_LowBitMLP(optimize_model, q_k, device=device)
 
     @classmethod
     @patch("transformers.dynamic_module_utils.get_imports", patch_flash_attn_import)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 6d3c95ee0bf..6871202c73b 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -61,6 +61,17 @@ def replace_with_QuantizedLinear(layer, qtype, device):
         return QuantizedLinear(qweights, scale, layer.bias)
 
 
+@module_optimization
+def replace_with_LowBitMLP(layer, qtype, device):
+    from transformers.models.llama.modeling_llama import LlamaMLP
+    from ipex_llm.transformers.npu_models.lowbitmlp import FusedLlamaLowBitMLP
+    if isinstance(layer, LlamaMLP):
+        weights = [(layer.gate_proj.weight, layer.gate_proj.scale),
+                   (layer.up_proj.weight, layer.up_proj.scale),
+                   (layer.down_proj.weight, layer.down_proj.scale)]
+        return FusedLlamaLowBitMLP(weights)  # TODO: handle bias
+
+
 def convert_forward(m, target_m, new_forward):
     if m.__class__ == target_m:
         bound_method = new_forward.__get__(m, m.__class__)
@@ -74,7 +85,6 @@ def optimize_llm(model: torch.nn.Module):
         from ipex_llm.transformers.npu_models.llama import merge_qkv
         from ipex_llm.transformers.npu_models.llama import merge_mlp
         model.apply(merge_qkv)
-        model.apply(merge_mlp)
 
         from ipex_llm.transformers.npu_models.llama import llama_model_forward
         from ipex_llm.transformers.npu_models.llama import llama_attention_forward
@@ -84,7 +94,6 @@ def optimize_llm(model: torch.nn.Module):
         from transformers.models.llama.modeling_llama import LlamaMLP
         convert_forward(model, LlamaModel, llama_model_forward)
         convert_forward(model, LlamaAttention, llama_attention_forward)
-        convert_forward(model, LlamaMLP, llama_mlp_forward)
 
     elif model.config.model_type == "mistral":
         from ipex_llm.transformers.npu_models.mistral import merge_qkv
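
Note: replace_with_LowBitMLP relies on the existing module_optimization decorator in convert.py to walk the model tree and swap in whatever module the decorated function returns; the function itself only decides whether one given layer should be replaced. The snippet below is a minimal, self-contained sketch of that decorator pattern, not the actual convert.py implementation (the traversal details and the toy replacement rule are illustrative assumptions):

    import torch

    def module_optimization(func):
        """Apply `func` to every submodule and swap in the module it returns, if any."""
        def wrapper(model, qtype, device, *args, **kwargs):
            for name, child in model.named_children():
                new_child = func(child, qtype, device, *args, **kwargs)
                if new_child is not None:
                    setattr(model, name, new_child)   # replace the matching child in place
                else:
                    wrapper(child, qtype, device, *args, **kwargs)  # recurse otherwise
        return wrapper

    @module_optimization
    def replace_relu_with_gelu(layer, qtype, device):
        # Toy replacement rule, mirroring how replace_with_LowBitMLP matches LlamaMLP.
        if isinstance(layer, torch.nn.ReLU):
            return torch.nn.GELU()

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(8, 8)
            self.act = torch.nn.ReLU()

    net = Toy()
    replace_relu_with_gelu(net, qtype=None, device="cpu")
    print(net)  # net.act is now GELU()
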
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lowbitmlp.py b/python/llm/src/ipex_llm/transformers/npu_models/lowbitmlp.py
new file mode 100644
index 00000000000..9bf2c0fc249
--- /dev/null
+++ b/python/llm/src/ipex_llm/transformers/npu_models/lowbitmlp.py
@@ -0,0 +1,126 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/intel/intel-npu-acceleration-library/blob/main/intel_npu_acceleration_library/nn/linear.py
+
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+
+from ipex_llm.transformers.npu_models.runtime import run_model
+from intel_npu_acceleration_library.backend.factory import NNFactory
+from typing import Optional, Sequence, List
+from functools import partial
+import numpy as np
+import torch
+import uuid
+
+
+class LowBitMLP(NNFactory):
+    """Computing a LowBit MLP with weights prefetching."""
+
+    def __init__(
+        self,
+        input_shape: Sequence[int],
+        intermediate_size: int,
+        activation: str = "swiglu",
+        bias: Optional[bool] = False,
+        dtype: np.dtype = np.int8,
+        profile: bool = False,
+        device: str = "NPU",
+        **additional_args
+    ):
+        """Initialize the LowBitMLP class.
+
+        Args:
+            input_shape (Sequence[int]): input shape channels.
+            intermediate_size (int): intermediate_size of the MLP.
+            activation (str): activation function to use.
+            bias (Optional[bool]): Enable/Disable bias. Defaults to False.
+            dtype (np.dtype): parameter type np.int8, np.uint8 and np.float16 supported.
+                              Defaults to np.int8. Uint8 represents packed i4 dtypes.
+            profile (bool): Enable/Disable profiling. Defaults to False.
+            device (str): Target device. Defaults to "NPU".
+            additional_args: additional arguments
+        """
+        super().__init__(profile, device)
+        self.intermediate_size = intermediate_size
+        self.batch, self.hidden_size = input_shape
+        input = self.parameter((self.batch, self.hidden_size))
+
+        mm1 = self.linear(input, self.intermediate_size, self.hidden_size,
+                          bias=bias, wt_dtype=dtype)
+
+        if activation == "swiglu":
+            mm2 = self.linear(input, self.intermediate_size, self.hidden_size,
+                              bias=bias, wt_dtype=dtype)  # type: ignore[attr-defined]
+            mm1 = self.eltwise_mul(self.swish(mm1), mm2)  # type: ignore[attr-defined]
+        elif activation == "clamp":
+            atc_fn = getattr(self, activation)
+            mm1 = atc_fn(mm1, additional_args.get("min"), additional_args.get("max"))
+        elif activation == "elu":
+            atc_fn = getattr(self, activation)
+            mm1 = atc_fn(mm1, additional_args.get("alpha", 1.0))
+        elif activation == "grn":
+            atc_fn = getattr(self, activation)
+            mm1 = atc_fn(mm1, additional_args.get("grn_bias"))
+        else:
+            atc_fn = getattr(self, activation)
+            mm1 = atc_fn(mm1)
+
+        _ = self.linear(mm1, self.hidden_size, self.intermediate_size, bias=bias, wt_dtype=dtype)
+        self.compile()
+
+
+class FusedLlamaLowBitMLP(torch.nn.Module):
+    """LLAMA LowBit MLP operation NPU backend."""
+
+    def __init__(
+        self,
+        parameters: List[torch.Tensor],
+    ):
+        """Initialize LLAMA LowBit MLP operation.
+
+        Args:
+            parameters (List[torch.Tensor]): model weights
+        """
+        super().__init__()
+        self.op_parameters = parameters
+        self.op_id = str(uuid.uuid4())
+        if isinstance(parameters[0], tuple):  # weight, scale from QuantizedLinear
+            np_dtype = np.int8 if parameters[0][0].dtype == torch.int8 else np.uint8
+            intermediate_size, _ = parameters[0][0].shape
+        else:  # FP16 Linear
+            np_dtype = np.float16
+            intermediate_size, _ = parameters[0].shape
+        self.backend_cls = partial(LowBitMLP, intermediate_size=intermediate_size, dtype=np_dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Torch module forward method.
+
+        Args:
+            x (torch.Tensor): Input tensor
+
+        Returns:
+            torch.Tensor: result
+        """
+        # Handle 3D input shape (similarly done in run_matmul)
+        original_shape = x.shape
+        if len(x.shape) > 2:
+            x = x.view([-1, x.shape[-1]])
+        output = run_model(x, self.op_parameters, self.backend_cls, self.op_id)
+        return output.view(original_shape)
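
For orientation, with the default activation="swiglu" the LowBitMLP graph reproduces the standard Llama MLP: gate and up projections of the same input, SiLU ("swish") on the gate branch, an elementwise multiply, then the down projection. The sketch below is a plain-PyTorch reference of that computation (float32 on CPU, illustrative only); the real path runs the compiled NPU graph with the quantized weight/scale pairs packed by replace_with_LowBitMLP:

    import torch
    import torch.nn.functional as F

    def llama_mlp_reference(x, gate_w, up_w, down_w):
        mm1 = x @ gate_w.T          # gate_proj -> [batch, intermediate_size]
        mm2 = x @ up_w.T            # up_proj   -> [batch, intermediate_size]
        hidden = F.silu(mm1) * mm2  # eltwise_mul(swish(mm1), mm2)
        return hidden @ down_w.T    # down_proj -> [batch, hidden_size]

    batch, hidden_size, intermediate_size = 2, 16, 64
    x = torch.randn(batch, hidden_size)
    gate_w = torch.randn(intermediate_size, hidden_size)  # gate_proj.weight shape
    up_w = torch.randn(intermediate_size, hidden_size)    # up_proj.weight shape
    down_w = torch.randn(hidden_size, intermediate_size)  # down_proj.weight shape
    print(llama_mlp_reference(x, gate_w, up_w, down_w).shape)  # torch.Size([2, 16])
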
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/runtime.py b/python/llm/src/ipex_llm/transformers/npu_models/runtime.py
new file mode 100644
index 00000000000..3f0177eb13f
--- /dev/null
+++ b/python/llm/src/ipex_llm/transformers/npu_models/runtime.py
@@ -0,0 +1,95 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is adapted from
+# https://github.com/intel/intel-npu-acceleration-library/blob/main/intel_npu_acceleration_library/nn/linear.py
+
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+
+from intel_npu_acceleration_library.backend.runtime import set_contiguous, record_function
+from intel_npu_acceleration_library.backend.runtime import adapt_output_tensor, _model_cache
+from typing import Optional, List, Union, Any
+from collections import deque
+import torch
+
+NUM_REPLICAS = 4  # TODO: make it an environment variable?
+
+
+@torch.no_grad()
+def run_model(
+    x: Union[torch.Tensor, List[torch.Tensor]],
+    weights: List[torch.Tensor],
+    backend_cls: Any,
+    op_id: Optional[str] = None,
+) -> torch.Tensor:
+    """Run a factory operation. Depending on the datatype of the weights it runs
+    a float or quantized operation.
+
+    Args:
+        x (Union[torch.Tensor, List[torch.Tensor]]): Activation tensor(s).
+            Its dtype must be torch.float16.
+        weights (List[torch.Tensor]): Weight tensors. Dtype can be torch.float16 or torch.int8.
+        backend_cls (Any): Backend class to run.
+        op_id (Optional[str], optional): Operation ID. Defaults to None.
+
+    Returns:
+        torch.Tensor: result
+    """
+    global _model_cache
+
+    # Include op_id only when provided (not all backend classes accept it)
+    op_kwargs = {"op_id": op_id} if op_id else {}
+
+    if not isinstance(x, (list, tuple)):
+        x = [x]
+
+    # Reshape input
+    input_dtype = x[0].dtype
+    x_np = [set_contiguous(elem).to(torch.float16).numpy() for elem in x]
+    op_args = []
+    op_args_flatten = []
+    for w in weights:
+        if isinstance(w, tuple):  # from QuantizedLinear
+            op_args.append((set_contiguous(w[0]).numpy(), set_contiguous(w[1]).numpy()))
+            op_args_flatten.append(op_args[-1][0])
+            op_args_flatten.append(op_args[-1][1])
+        else:
+            op_args.append(set_contiguous(w).numpy())
+            op_args_flatten.append(op_args[-1])
+
+    shape_dtype_signature = "_".join(
+        ["_".join(str(dim) for dim in t.shape) + f"_{t.dtype}" for t in x_np + op_args_flatten]
+    )
+    key = f"{backend_cls.func.__name__}_{shape_dtype_signature}"
+    models = _model_cache.get(key, None)
+
+    input_shapes = [elem.shape for elem in x_np]
+    if models is None:
+        _model_cache[key] = deque([backend_cls(*input_shapes) for i in range(NUM_REPLICAS)])
+    elif len(models) < 1:
+        _model_cache[key].append(backend_cls(*input_shapes))
+    else:
+        _model_cache[key].rotate(1)
+
+    # Get the model
+    model = _model_cache[key][0]
+
+    with record_function(f"npu_factory_mul_{key}"):
+        ret = model.run(*x_np, *op_args, **op_kwargs)
+
+    return adapt_output_tensor(ret, ret.shape, input_dtype)
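
run_model keys _model_cache on the backend class name plus the shape/dtype signature of the activations and weights, pre-builds NUM_REPLICAS backend instances for that key, and rotates the deque so consecutive calls cycle through the replicas, presumably to overlap one replica's weight prefetch with another's execution (per the "weights prefetching" note on LowBitMLP). A dependency-free sketch of just that caching/rotation behaviour, with a dummy backend standing in for the compiled graph:

    from collections import deque

    NUM_REPLICAS = 4
    _model_cache = {}  # key -> deque of pre-built backend instances

    class DummyBackend:
        """Stand-in for a compiled LowBitMLP/NNFactory graph."""
        def __init__(self, input_shape):
            self.input_shape = input_shape
        def run(self, x):
            return x  # the real backend would execute the NPU graph here

    def get_replica(key, input_shape):
        models = _model_cache.get(key, None)
        if models is None:
            # First request for this signature: build all replicas up front.
            _model_cache[key] = deque(DummyBackend(input_shape) for _ in range(NUM_REPLICAS))
        elif len(models) < 1:
            # Cache entry exists but is empty: add a fresh instance.
            _model_cache[key].append(DummyBackend(input_shape))
        else:
            # Rotate so successive calls pick a different replica.
            _model_cache[key].rotate(1)
        return _model_cache[key][0]

    ids = [id(get_replica("LowBitMLP_1_4096_float16", (1, 4096))) for _ in range(8)]
    print(len(set(ids)))  # 4 distinct replicas, reused in a cycle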