diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index cb67b157e6a..43559944c36 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -27,7 +27,7 @@
 from ipex_llm.utils.common.log4Error import invalidInputError
 from ipex_llm.transformers.utils import logger, load_imatrix_data
-from ipex_llm.transformers.npu_models.convert import optimize_llm, optimize_llm_post
+from ipex_llm.transformers.npu_models.convert import optimize_llm


 def patch_flash_attn_import(filename: str) -> List[str]:
@@ -207,8 +207,6 @@ def from_pretrained(cls, *args, **kwargs):
             model = model.eval()
             logger.info(f"Finish to convert model")
         else:
-            from intel_npu_acceleration_library.compiler import create_npu_kernels
-
             if optimize_model:
                 invalidInputError(
                     max_prompt_len < max_context_len,
@@ -232,11 +230,14 @@ def from_pretrained(cls, *args, **kwargs):
                     "convert_model": convert_model,
                     "save_directory": save_directory,
                     "fuse_layers": fuse_layers,
-                    "imatrix_data": imatrix_data
+                    "imatrix_data": imatrix_data,
+                    "skip_npu_logic": mock_device == "dummy",
                 }
+                # Dummy will skip npu related logic and save the quantized model
+                if mock_device == "dummy":
+                    model.save_low_bit = types.MethodType(save_low_bit, model)
                 model = cls.optimize_npu_model(*args, **optimize_kwargs)
             else:
-                from ipex_llm.transformers.npu_models.convert import optimize_llm
                 optimize_llm(model)
                 with torch.no_grad():
                     cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
@@ -258,7 +259,6 @@ def from_pretrained(cls, *args, **kwargs):
     def optimize_npu_model(cls, *args, **kwargs):

         from ipex_llm.transformers.npu_models.convert_mp import optimize_llm_pre, optimize_llm
-        from intel_npu_acceleration_library.compiler import create_npu_kernels

         model = kwargs.pop("model")
         qtype = kwargs.pop("qtype", "sym_int4_rtn")
@@ -275,6 +275,7 @@ def optimize_npu_model(cls, *args, **kwargs):
         save_directory = kwargs.pop('save_directory', None)
         fuse_layers = kwargs.pop('fuse_layers', None)
         imatrix_data = kwargs.pop('imatrix_data', None)
+        skip_npu_logic = kwargs.pop("skip_npu_logic", False)
         invalidInputError(save_directory is not None,
                           "Please provide the path to save converted model "
                           "through `save_directory`.")
@@ -294,51 +295,58 @@ def optimize_npu_model(cls, *args, **kwargs):
             cls.load_convert(qtype, model, "cpu", modules_to_not_convert,
                              quantization_group_size, imatrix_data,
                              *args, **kwargs)
-            create_npu_kernels(llm)
+            if not skip_npu_logic:
+                from intel_npu_acceleration_library.compiler import create_npu_kernels
+                create_npu_kernels(llm)
         model = model.eval()
         logger.info(f"Finish to convert model")
         model.config.update({"bigdl_transformers_low_bit": qtype})
-        model.share_memory()
-
-        if not pipeline:
-            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
-                from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
-                optimize_llm_single_process(
-                    llm,
-                    kv_len=max_context_len,
-                    max_prompt_len=max_prompt_len,
-                    transpose_value_cache=transpose_value_cache,
-                    group_size=quantization_group_size,
-                    qtype=qtype,
-                    save_directory=save_directory,
-                    fuse_layers=fuse_layers,
-                    has_llm=hasattr(model, "llm")
-                )
-            else:
-                optimize_llm(
-                    llm,
-                    max_context_len=max_context_len,
-                    max_prompt_len=max_prompt_len,
-                    inter_pp=inter_pp,
-                    intra_pp=intra_pp,
-                    transpose_value_cache=transpose_value_cache,
-                    group_size=quantization_group_size
-                )
-        else:
-            from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
-                import convert_llm
-            convert_llm(llm,
-                        kv_len=max_context_len,
-                        max_prompt_len=max_prompt_len,
-                        transpose_value_cache=transpose_value_cache,
-                        group_size=quantization_group_size,
-                        qtype=qtype,
-                        convert_model=convert_model,
-                        save_directory=save_directory,
-                        fuse_layers=fuse_layers)
-        model.save_low_bit = types.MethodType(save_low_bit, model)
-        model.save_low_bit(save_directory)
-        logger.info(f"Converted model has already saved to {save_directory}.")
+        if skip_npu_logic:
+            model.save_low_bit(model_dir=save_directory)
+        else:
+            model.share_memory()
+
+            if not pipeline:
+                if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                    from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                    optimize_llm_single_process(
+                        llm,
+                        kv_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size,
+                        qtype=qtype,
+                        save_directory=save_directory,
+                        fuse_layers=fuse_layers,
+                        has_llm=hasattr(model, "llm")
+                    )
+                else:
+                    optimize_llm(
+                        llm,
+                        max_context_len=max_context_len,
+                        max_prompt_len=max_prompt_len,
+                        inter_pp=inter_pp,
+                        intra_pp=intra_pp,
+                        transpose_value_cache=transpose_value_cache,
+                        group_size=quantization_group_size
+                    )
+            else:
+                from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
+                    import convert_llm
+                convert_llm(llm,
+                            kv_len=max_context_len,
+                            max_prompt_len=max_prompt_len,
+                            transpose_value_cache=transpose_value_cache,
+                            group_size=quantization_group_size,
+                            qtype=qtype,
+                            convert_model=convert_model,
+                            save_directory=save_directory,
+                            fuse_layers=fuse_layers)
+            model.save_low_bit = types.MethodType(save_low_bit, model)
+            model.save_low_bit(save_directory)
+            logger.info(f"Converted model has already saved to {save_directory}.")
+
         return model
@@ -379,6 +387,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
     intra_pp = kwargs.pop("intra_pp", None)
     transpose_value_cache = kwargs.pop("transpose_value_cache", True)
     modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+    save_directory = kwargs.pop('save_directory', None)

     from transformers.models.auto.configuration_auto import AutoConfig
     from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -650,16 +659,37 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
             param.requires_grad_(False)

     if optimize_model and not pipeline:
-        from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
-        optimize_llm(
-            llm,
-            max_context_len=max_context_len,
-            max_prompt_len=max_prompt_len,
-            inter_pp=inter_pp,
-            intra_pp=intra_pp,
-            transpose_value_cache=transpose_value_cache,
-            group_size=quantization_group_size
-        )
+        if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+            from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+            if save_directory is None:
+                invalidInputError(False,
+                                  "Please specify the save_directory, the path of folder " +
+                                  "to save the compiled NPU model. If path not exists, " +
+                                  "the compiled NPU model will be saved there. " +
+                                  "Else, program will exit.")
+
+            optimize_llm_single_process(
+                llm,
+                kv_len=max_context_len,
+                max_prompt_len=max_prompt_len,
+                transpose_value_cache=transpose_value_cache,
+                group_size=quantization_group_size,
+                qtype=qtype,
+                save_directory=save_directory,
+                fuse_layers=None,
+                has_llm=hasattr(model, "llm")
+            )
+        else:
+            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
+            optimize_llm(
+                llm,
+                max_context_len=max_context_len,
+                max_prompt_len=max_prompt_len,
+                inter_pp=inter_pp,
+                intra_pp=intra_pp,
+                transpose_value_cache=transpose_value_cache,
+                group_size=quantization_group_size
+            )
     elif optimize_model and pipeline:
         from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
             import convert_llm
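With the `load_low_bit` hunks above, the single-process path (model types `qwen2`, `llama`, `minicpm`) now requires `save_directory`, the folder where the compiled NPU model is cached, and raises `invalidInputError` when it is missing. A minimal usage sketch of the new contract, assuming the `AutoModelForCausalLM` wrapper exported by `ipex_llm.transformers.npu_model`; the paths and the exact set of keyword arguments are illustrative, not taken from this diff:

```python
# Hedged sketch: reload a low-bit checkpoint and let the single-process path
# compile (or reuse) the NPU model cached under `save_directory`.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.load_low_bit(
    "./qwen2-7b-sym-int4",           # folder previously written by save_low_bit
    optimize_model=True,
    pipeline=False,                  # takes the optimize_llm_single_process branch
    max_context_len=1024,
    max_prompt_len=512,
    save_directory="./qwen2-7b-npu-compiled",  # new requirement in this diff
)
```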
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
index 5750540d04e..ede8d88e613 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert_mp.py
@@ -18,7 +18,7 @@
 import importlib
 import numpy as np
 from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params
-from ipex_llm.transformers.npu_models.lm_head import LMHeadLinear, SlicedLMHead
+from ipex_llm.transformers.npu_models.lm_head import SlicedLMHead
 from ipex_llm.utils.common.log4Error import invalidInputError


diff --git a/python/llm/src/ipex_llm/transformers/npu_models/linear.py b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
index 461d9a7012b..b6a8638db0b 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/linear.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/linear.py
@@ -21,16 +21,25 @@
 # SPDX-License-Identifier: Apache 2.0
 #

-from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
-from intel_npu_acceleration_library.dtypes import NPUDtype
+
 import os
 import torch
 from torch.nn import Parameter
 import uuid
 import math
-from intel_npu_acceleration_library.backend import run_matmul
 from typing import Optional, Union
 from ipex_llm.utils.common import invalidInputError
+import importlib
+
+
+def is_acclib_available():
+    return importlib.util.find_spec("intel_npu_acceleration_library") is not None
+
+
+if is_acclib_available():
+    from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+    from intel_npu_acceleration_library.dtypes import NPUDtype
+    from intel_npu_acceleration_library.backend import run_matmul


 class Linear(torch.nn.Module):
@@ -63,6 +72,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         if self.training:
             out = self._mm(x, self.weight, None)
         else:
+            from intel_npu_acceleration_library.backend import run_matmul
             out = run_matmul(x, self.weight, None, self.op_id)

         if self.bias is None:
@@ -105,6 +115,8 @@ def fromTensor(
         Returns:
             Union[Linear, QuantizedLinear]: A NPU linear layer
         """
+        from intel_npu_acceleration_library.quantization import quantize_tensor, compress_to_i4
+        from intel_npu_acceleration_library.dtypes import NPUDtype
         if dtype.is_floating_point:
             if bias is None:
                 return Linear(weight.to(dtype), None)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py
index 0184805996b..6ea78ea24cd 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head.py
@@ -16,96 +16,6 @@
 import torch
 from torch import nn
 import numpy as np
-from filelock import FileLock
-from intel_npu_acceleration_library.backend import NNFactory
-from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
-
-
-class LMHeadLinear(NNFactory):
-    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
-    with weights prefetching."""
-
-    def __init__(
-        self,
-        inC: int,
-        outC: int,
-        batch: int,
-        split_num: int = 2,
-        profile: bool = False,
-        device: str = "NPU",
-        dtype: np.dtype = np.int8,
-        use_split: bool = False,
-        group_size: int = 0,
-        asym: bool = False,
-    ):
-        """Initialize the LMHeadLinear class.
-
-        Args:
-            inC (int): input channels
-            outC (int): output channels
-            batch (int): batch
-            split_num (int): split in_features of lm_head to how many parts
-            profile (bool): Enable/Disable profiling. Defaults to False.
-            device (str): Target device, default to "NPU".
-            dtype (np.dtype): weights datatype. Defaults to np.int8.
-
-        """
-        super().__init__(profile, device)
-        self.inC, self.outC = inC, outC
-        self.batch = batch
-
-        self.split_num = split_num
-        if use_split:
-            input = self.parameter((1, self.batch, self.inC))
-            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
-                                       scale_factor=(group_size == 0), asym=asym)
-        else:
-            input = self.parameter((self.batch, self.inC))
-            split_size = self.inC // split_num // 2 * 2
-
-            for i in range(self.split_num):
-                start_idx = i * split_size
-                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
-                input_slice = self.slice(input, begin=[0, start_idx],
-                                         end=[self.batch, end_idx])
-                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
-                                           wt_dtype=dtype, asym=asym)
-                if i == 0:
-                    res = linear_slice
-                else:
-                    res += linear_slice
-
-        print("start compiling lm_head")
-        self.compile()
-        print("end compiling lm_head")
-
-    def set_weights(self, op_id, weights):
-        self.set_weights_async(op_id, weights)
-        with FileLock(f"lmhead_run.lock"):
-            backend_lib.run(self._mm)
-
-    def set_weights_async(self, op_id, weights):
-        self.setWeights(1, op_id, *weights)
-
-    def run(
-        self, X: np.ndarray
-    ) -> np.ndarray:
-        """Run the layer: $X * (W * S)^T$ .
-
-        Args:
-            X (np.ndarray): activation
-
-        Raises:
-            RuntimeError: Input, weights or scale shape mismatch
-
-        Returns:
-            np.ndarray: result
-        """
-        self.set_input_tensor(X, 0)
-        self.elapsed = backend_lib.run(self._mm)
-        if len(self.out) == 1:
-            return self.out[0]
-        return self.out


 class SlicedLMHead(nn.Module):
@@ -160,6 +70,7 @@ def get_weight_dtype(self):
         return self.lm_heads[0].weight.dtype

     def get_fused_lm_head(self):
+        from ipex_llm.transformers.npu_models.lm_head_linear import LMHeadLinear
         np_dtype = np.uint8 if self.get_weight_dtype() == torch.uint8 else np.int8
         self.fused_lm_head = LMHeadLinear(self.inC, self.outC, 1, self.split_num,
                                           False, "NPU", dtype=np_dtype, use_split=self.use_split,
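The class being relocated rests on a simple identity: because matmul is linear in its inputs, splitting `in_features` into slices, multiplying each input slice against the matching weight slice, and accumulating the partial products reproduces the full matmul. A quick self-contained torch check of that identity, using the same even-sized split rule as the non-`use_split` branch (shapes are illustrative):

```python
# Verify the slice-and-accumulate identity behind the sliced lm_head.
import torch

batch, inC, outC, split_num = 2, 4096, 1000, 4
x = torch.randn(batch, inC)
w = torch.randn(outC, inC)

full = x @ w.t()                          # reference: unsplit lm_head matmul

split_size = inC // split_num // 2 * 2    # even split, as in LMHeadLinear
acc = torch.zeros(batch, outC)
for i in range(split_num):
    start_idx = i * split_size
    # the last slice absorbs any remainder, mirroring end_idx above
    end_idx = (i + 1) * split_size if i < split_num - 1 else inC
    acc += x[:, start_idx:end_idx] @ w[:, start_idx:end_idx].t()

assert torch.allclose(full, acc, atol=1e-3)
```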
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/lm_head_linear.py b/python/llm/src/ipex_llm/transformers/npu_models/lm_head_linear.py
new file mode 100644
index 00000000000..f43f246db14
--- /dev/null
+++ b/python/llm/src/ipex_llm/transformers/npu_models/lm_head_linear.py
@@ -0,0 +1,106 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from filelock import FileLock
+from intel_npu_acceleration_library.backend import NNFactory
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+
+
+class LMHeadLinear(NNFactory):
+    """Quantized Linear class for sliced lm_head, computing a matrix matrix multiplication
+    with weights prefetching."""
+
+    def __init__(
+        self,
+        inC: int,
+        outC: int,
+        batch: int,
+        split_num: int = 2,
+        profile: bool = False,
+        device: str = "NPU",
+        dtype: np.dtype = np.int8,
+        use_split: bool = False,
+        group_size: int = 0,
+        asym: bool = False,
+    ):
+        """Initialize the LMHeadLinear class.
+
+        Args:
+            inC (int): input channels
+            outC (int): output channels
+            batch (int): batch
+            split_num (int): split in_features of lm_head to how many parts
+            profile (bool): Enable/Disable profiling. Defaults to False.
+            device (str): Target device, default to "NPU".
+            dtype (np.dtype): weights datatype. Defaults to np.int8.
+
+        """
+        super().__init__(profile, device)
+        self.inC, self.outC = inC, outC
+        self.batch = batch
+
+        self.split_num = split_num
+        if use_split:
+            input = self.parameter((1, self.batch, self.inC))
+            res = self.dq_split_linear(input, self.split_num, self.outC, self.inC, wt_dtype=dtype,
+                                       scale_factor=(group_size == 0), asym=asym)
+        else:
+            input = self.parameter((self.batch, self.inC))
+            split_size = self.inC // split_num // 2 * 2
+
+            for i in range(self.split_num):
+                start_idx = i * split_size
+                end_idx = (i + 1) * split_size if i < self.split_num - 1 else self.inC
+                input_slice = self.slice(input, begin=[0, start_idx],
+                                         end=[self.batch, end_idx])
+                linear_slice = self.linear(input_slice, outC, split_size, bias=False,
+                                           wt_dtype=dtype, asym=asym)
+                if i == 0:
+                    res = linear_slice
+                else:
+                    res += linear_slice
+
+        print("start compiling lm_head")
+        self.compile()
+        print("end compiling lm_head")
+
+    def set_weights(self, op_id, weights):
+        self.set_weights_async(op_id, weights)
+        with FileLock(f"lmhead_run.lock"):
+            backend_lib.run(self._mm)
+
+    def set_weights_async(self, op_id, weights):
+        self.setWeights(1, op_id, *weights)
+
+    def run(
+        self, X: np.ndarray
+    ) -> np.ndarray:
+        """Run the layer: $X * (W * S)^T$ .
+
+        Args:
+            X (np.ndarray): activation
+
+        Raises:
+            RuntimeError: Input, weights or scale shape mismatch
+
+        Returns:
+            np.ndarray: result
+        """
+        self.set_input_tensor(X, 0)
+        self.elapsed = backend_lib.run(self._mm)
+        if len(self.out) == 1:
+            return self.out[0]
+        return self.out
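The thread running through all four files is that `intel_npu_acceleration_library` becomes an optional dependency: module-scope imports are either guarded by `is_acclib_available()` or pushed down into the call sites that actually need them (cheap after the first call, since repeat imports are a `sys.modules` lookup). A self-contained sketch of that guard pattern; `npu_matmul` is a hypothetical stand-in for a wrapper around `run_matmul`, not part of the library:

```python
# Optional-dependency guard, as adopted in linear.py and lm_head.py above.
import importlib.util


def is_acclib_available() -> bool:
    # find_spec returns None when the package cannot be found, so importing
    # this module no longer hard-fails on machines without the NPU stack.
    return importlib.util.find_spec("intel_npu_acceleration_library") is not None


if is_acclib_available():
    # Module-scope import runs only where the library is installed.
    from intel_npu_acceleration_library.backend import run_matmul  # noqa: F401


def npu_matmul(x, weight, op_id):
    # Deferred import: a missing library surfaces only if the NPU path is
    # actually exercised; the dummy-device save flow never reaches here.
    from intel_npu_acceleration_library.backend import run_matmul
    return run_matmul(x, weight, None, op_id)
```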