intel-analytics · MeouSker77 · Dec 27, 2024 · Dec 27, 2024
diff --git a/docs/mddocs/PythonAPI/optimize.md b/docs/mddocs/PythonAPI/optimize.md
@@ -3,7 +3,7 @@
 ## Optimize Model
 You can run any PyTorch model with `optimize_model` through only one-line code change to benefit from IPEX-LLM optimization, regardless of the library or API you are using.
 
-### `ipex_llm.optimize_model`_`(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_convert=None, cpu_embedding=False, lightweight_bmm=False, **kwargs)`_
+### `ipex_llm.optimize_model`_`(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_convert=None, cpu_embedding=False, **kwargs)`_
 
 A method to optimize any pytorch model.
 
@@ -19,8 +19,6 @@ A method to optimize any pytorch model.
 
   - **cpu_embedding**: Whether to replace the Embedding layer, may need to set it to `True` when running IPEX-LLM on GPU. Default to be `False`.
 
-  - **lightweight_bmm**: Whether to replace the `torch.bmm` ops, may need to set it to `True` when running IPEX-LLM on GPU on Windows. Default to be `False`.
-
 - **Returns**: The optimized model.
 
 - **Example**:
@@ -76,4 +74,4 @@ Load the optimized pytorch model.
   from ipex_llm.optimize import load_low_bit
   model = whisper.load_model('tiny') # A model instance through traditional loading method
   model = load_low_bit(model, saved_dir) # Load the optimized model
-  ```
+  ```
diff --git a/docs/mddocs/PythonAPI/transformers.md b/docs/mddocs/PythonAPI/transformers.md
@@ -29,8 +29,6 @@ Three new arguments are added to extend Hugging Face’s from_pretrained method
 
   - **cpu_embedding**: Whether to replace the Embedding layer, may need to set it to `True` when running IPEX-LLM on GPU. Default to be `False`.
 
-  - **lightweight_bmm**: Whether to replace the torch.bmm ops, may need to set it to `True` when running IPEX-LLM on GPU on Windows. Default to be `False`.
-
   - **imatrix**: `str` value, represent filename of importance matrix pretrained on specific datasets for use with the improved quantization methods recently added to llama.cpp.
 
   - **model_hub**: `str` value, options are `'huggingface'` and `'modelscope'`, specify the model hub. Default to be `'huggingface'`.
@@ -48,7 +46,7 @@ Three new arguments are added to extend Hugging Face’s from_pretrained method
 Load gguf model and tokenizer and convert it to bigdl-llm model and huggingface tokenzier
 
 - **Parameters**:
-  
+
   - **fpath**: Path to gguf model file
 
   - **optimize_model**: Whether to further optimize llm model, defaults to `True`
@@ -64,7 +62,7 @@ Load gguf model and tokenizer and convert it to bigdl-llm model and huggingface
 Load a low bit optimized model (including INT4, INT5 and INT8) from a saved ckpt.
 
 - **Parameters**:
-  
+
   - **pretrained_model_name_or_path**: `str` value, Path to load the optimized model ckpt.
 
   - **optimize_model**: `boolean` value, Whether to further optimize the low_bit llm model.

diff --git a/python/llm/src/ipex_llm/optimize.py b/python/llm/src/ipex_llm/optimize.py
@@ -195,7 +195,7 @@ def load_low_bit(model, model_path):
 
 
 def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_convert=None,
-                   cpu_embedding=False, lightweight_bmm=False, **kwargs):
+                   cpu_embedding=False, **kwargs):
     """
     A method to optimize any pytorch model.
 
@@ -211,8 +211,6 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
         when conducting model optimizations. Default to be ``None``.
     :param cpu_embedding: Whether to replace the Embedding layer, may need to set it
         to ``True`` when running BigDL-LLM on GPU on Windows. Default to be ``False``.
-    :param lightweight_bmm: Whether to replace the torch.bmm ops, may need to set it
-        to ``True`` when running BigDL-LLM on GPU on Windows. Default to be ``False``.
 
     :return: The optimized model.
 
@@ -256,8 +254,7 @@ def optimize_model(model, low_bit='sym_int4', optimize_llm=True, modules_to_not_
                                  torch_dtype=torch_dtype,
                                  optimize_model=optimize_llm,
                                  modules_to_not_convert=modules_to_not_convert,
-                                 cpu_embedding=cpu_embedding,
-                                 lightweight_bmm=lightweight_bmm)
+                                 cpu_embedding=cpu_embedding)
     # add save_low_bit to pretrained model dynamically
     import types
     model._bigdl_config = dict()

diff --git a/python/llm/src/ipex_llm/transformers/bmm.py b/python/llm/src/ipex_llm/transformers/bmm.py
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
@@ -1078,7 +1078,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          convert_shape_only=False, device="cpu",
                          modules_to_not_convert=None,
                          cpu_embedding=False,
-                         lightweight_bmm=False, torch_dtype="auto",
+                         torch_dtype="auto",
                          imatrix_data=None,
                          embedding_qtype=None,
                          mixed_precision=False):
@@ -1146,7 +1146,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
             pass
 
     if optimize_model:
-        model = _optimize_post(model, lightweight_bmm)
+        model = _optimize_post(model)
 
     if hasattr(model, "config") and hasattr(model.config, "model_type") and \
             model.config.model_type == "qwen" and hasattr(model.config, "visual"):
@@ -1247,7 +1247,7 @@ def _optimize_ipex(model, qtype=ggml_tensor_qtype["bf16"]):
     return _ipex_jit(model)
 
 
-def _optimize_post(model, lightweight_bmm=False):
+def _optimize_post(model):
     try:
         from diffusers import DiffusionPipeline, StableDiffusionXLPipeline
         if isinstance(model, DiffusionPipeline):
@@ -1627,7 +1627,7 @@ def _optimize_post(model, lightweight_bmm=False):
             vision_embedding._get_pos_embed = MethodType(_get_pos_embed, vision_embedding)
             vision_module = importlib.import_module(vision_model.__class__.__module__)
             convert_forward(vision_model, vision_module.InternAttention, intern_attention_forward)
-        _optimize_post(model.language_model, lightweight_bmm=lightweight_bmm)
+        _optimize_post(model.language_model)
     elif model.config.model_type == "qwen":
         if hasattr(model.config, "visual"):
             # for Qwen-VL-Chat
@@ -1731,7 +1731,7 @@ def _optimize_post(model, lightweight_bmm=False):
                         module.Qwen2MoeSdpaAttention,
                         qwen2_attention_forward)
     elif model.config.model_type == "qwen2_audio":
-        _optimize_post(model.language_model, lightweight_bmm=lightweight_bmm)
+        _optimize_post(model.language_model)
     elif model.config.model_type == "qwen2_vl":
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
@@ -1875,20 +1875,6 @@ def _optimize_post(model, lightweight_bmm=False):
         modeling_module_name = model.__class__.__module__
         module = importlib.import_module(modeling_module_name)
         convert_forward(model, module.YiRMSNorm, rms_norm_forward)
-    elif model.config.model_type == "whisper" and lightweight_bmm:
-        if platform.system().lower() == 'windows':
-            from ipex_llm.transformers.bmm import SafeBMM
-            modeling_module_name = model.__class__.__module__
-            module = importlib.import_module(modeling_module_name)
-            old_fwd = module.WhisperAttention.forward
-
-            def safe_bmm_fwd(*args, **kwargs):
-                with SafeBMM():
-                    return old_fwd(*args, **kwargs)
-
-            convert_forward(model,
-                            module.WhisperAttention,
-                            safe_bmm_fwd)
     elif model.config.model_type == "rwkv":
         # rwkv v4
         modeling_module_name = model.__class__.__module__
@@ -2081,7 +2067,7 @@ def safe_bmm_fwd(*args, **kwargs):
         elif model.config.hidden_size == 1536 and model.config.vocab_size == 73464:
             # MiniCPM-V ?
             model.llm.config.model_type = "minicpm"
-        _optimize_post(model.llm, lightweight_bmm=lightweight_bmm)
+        _optimize_post(model.llm)
         model.llm.config.model_type = "minicpmv"
 
         vpm_modeling_module_name = model.vpm.__class__.__module__
@@ -2135,7 +2121,7 @@ def safe_bmm_fwd(*args, **kwargs):
         # llm
         model.llm.config.model_type = "llama"
         model.llm.config.rope_scaling = {"rope_type": "default"}
-        _optimize_post(model.llm, lightweight_bmm=lightweight_bmm)
+        _optimize_post(model.llm)
         model.llm.config.model_type = "megrezo"
 
     return model
diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py
@@ -147,8 +147,6 @@ def from_pretrained(cls,
             to ``True`` when running BigDL-LLM on GPU on Windows. Default to be ``False``.
         :param disk_embedding: Whether to put the Embedding layer on disk to save memory.
             Default to be ``False``.
-        :param lightweight_bmm: Whether to replace the torch.bmm ops, may need to set it
-            to ``True`` when running BigDL-LLM on GPU on Windows. Default to be ``False``.
         :param imatrix: str value, represent filename of importance matrix pretrained on
             specific datasets for use with the improved quantization methods recently
             added to llama.cpp.
@@ -441,7 +439,6 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
                           " please use cpu_embedding instead.", FutureWarning)
             cpu_embedding = True
         disk_embedding = kwargs.pop("disk_embedding", False)
-        lightweight_bmm = kwargs.pop("lightweight_bmm", False)
         quant_config = kwargs.pop("quantization_config", None)
         imatrix_data = kwargs.pop("imatrix_data", None)
         embedding_qtype = kwargs.pop("embedding_qtype", None)
@@ -513,7 +510,6 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
         model = ggml_convert_low_bit(model, qtype, optimize_model,
                                      modules_to_not_convert=modules_to_not_convert,
                                      cpu_embedding=cpu_embedding,
-                                     lightweight_bmm=lightweight_bmm,
                                      torch_dtype=kwargs.get("torch_dtype", 'auto'),
                                      imatrix_data=imatrix_data,
                                      embedding_qtype=embedding_qtype,
@@ -576,7 +572,6 @@ def load_low_bit(cls,
                           " please use cpu_embedding instead.", FutureWarning)
             cpu_embedding = True
         disk_embedding = kwargs.pop("disk_embedding", False)
-        lightweight_bmm = kwargs.pop("lightweight_bmm", False)
         # Autofactory
         trust_remote_code = kwargs.pop("trust_remote_code", None)
         kwargs_orig = copy.deepcopy(kwargs)
@@ -713,7 +708,6 @@ def load_low_bit(cls,
         model = ggml_convert_low_bit(model, qtype, optimize_model, device=quant_device,
                                      modules_to_not_convert=modules_to_not_convert,
                                      cpu_embedding=cpu_embedding,
-                                     lightweight_bmm=lightweight_bmm,
                                      embedding_qtype=embedding_qtype, torch_dtype=torch_dtype)
 
         if is_sharded:

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -116,7 +116,6 @@ def from_pretrained(cls, *args, **kwargs):
 
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
-        ignore_argument(kwargs, "lightweight_bmm")
         ignore_argument(kwargs, "load_in_4bit")
         ignore_argument(kwargs, "load_in_8bit")
         ignore_argument(kwargs, "imatrix")
@@ -365,7 +364,6 @@ def load_convert_cpu(cls, q_k, optimize_model, device, modules_to_not_convert,
     def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs):
         # ignore following arguments
         ignore_argument(kwargs, "model_hub")
-        ignore_argument(kwargs, "lightweight_bmm")
         ignore_argument(kwargs, "cpu_embedding")
         ignore_argument(kwargs, "embedding_qtype")
         ignore_argument(kwargs, "speculative")