From 646730c176b649c2abf2c77bd5497960b5718140 Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Tue, 18 Jun 2024 15:42:04 +0800
Subject: [PATCH 1/2] add fp16 NPU Linear support and fix version 1.0 support

---
 .../src/ipex_llm/transformers/npu_model.py | 45 ++++++++++++-------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 3c2977d9777..83ebe2448d0 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -22,7 +22,6 @@
 from transformers.dynamic_module_utils import get_imports
 
 import intel_npu_acceleration_library as npu_lib
-from intel_npu_acceleration_library.dtypes import int8, int4
 
 from ipex_llm.utils.common.log4Error import invalidInputError
 
@@ -55,7 +54,8 @@ def from_pretrained(cls,
         The loaded model will run supported OPs on NPU, then run other OPs on CPU.
 
         Three new arguments are added to extend Hugging Face's from_pretrained method as follows:
-        :param load_in_low_bit: str value, options are ``'sym_int4'``, ``'sym_int8'``, ``'fp32'``.
+        :param load_in_low_bit: str value, options are ``'sym_int4'``, ``'sym_int8'``,
+                                ``'fp16'``, ``'fp32'``.
                                 Relevant low bit optimizations will be applied to the model.
         :return: a model instance
         """
@@ -63,20 +63,31 @@ def from_pretrained(cls,
             warnings.warn("`device_map` will be ignored")
         kwargs['device_map'] = 'cpu'
 
-        low_bit = kwargs.pop('load_in_low_bit', None)
-        low_bit_to_dtype_map = {
-            'sym_int4': int4,
-            'sym_int8': int8,
-            'fp32': torch.float,
-        }
-        if low_bit is not None:
-            dtype = low_bit_to_dtype_map[low_bit]
-        else:
-            dtype = kwargs.get('torch_dtype', torch.float)
-            dtype = torch.float if dtype == 'auto' else dtype
-        invalidInputError(dtype in low_bit_to_dtype_map.values(),
-                          f"unsupported dtype: {dtype}, "
-                          "only `sym_int4`, `sym_int8`, `fp32` are supported")
+        if kwargs.get('torch_dtype', None) not in [None, 'auto', torch.float]:
+            warnings.warn("`torch_dtype` will be ignored, `torch.float` will be used")
+        kwargs['torch_dtype'] = torch.float
+
+        low_bit = kwargs.pop('load_in_low_bit', torch.float)
+        try:
+            # for intel_npu_acceleration_library >= 1.1.0
+            from intel_npu_acceleration_library.dtypes import int8, int4
+            qtype_map = {
+                'sym_int4': int4,
+                'sym_int8': int8,
+                'fp16': torch.half,
+                'fp32': torch.float,
+            }
+        except ImportError as _e:
+            # for intel_npu_acceleration_library < 1.1.0
+            qtype_map = {
+                'sym_int8': torch.int8,
+                'fp16': torch.half,
+                'fp32': torch.float,
+            }
+        invalidInputError(low_bit in qtype_map.keys(),
+                          f"unsupported low_bit: {low_bit}, "
+                          f"only {list(qtype_map.keys())} are supported")
+        qtype = qtype_map[low_bit]
 
         kwargs["low_cpu_mem_usage"] = True
 
@@ -96,7 +107,7 @@ def from_pretrained(cls,
         ignore_argument(kwargs, "pipeline_parallel_stages")
 
         model = cls.HF_Model.from_pretrained(*args, **kwargs)
-        model = npu_lib.compile(model, dtype, False)
+        model = npu_lib.compile(model, qtype, False)
 
         return model
 

From 7c9f9ecbc09c80317927ae3432f2f3f2d4c4589d Mon Sep 17 00:00:00 2001
From: Yishuo Wang
Date: Tue, 18 Jun 2024 15:43:48 +0800
Subject: [PATCH 2/2] update

---
 python/llm/src/ipex_llm/transformers/npu_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 83ebe2448d0..78b83f88fa1 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -67,7 +67,7 @@ def from_pretrained(cls,
             warnings.warn("`torch_dtype` will be ignored, `torch.float` will be used")
         kwargs['torch_dtype'] = torch.float
 
-        low_bit = kwargs.pop('load_in_low_bit', torch.float)
+        low_bit = kwargs.pop('load_in_low_bit', 'fp32')
        try:
             # for intel_npu_acceleration_library >= 1.1.0
             from intel_npu_acceleration_library.dtypes import int8, int4
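
Note (not part of the patch): a minimal usage sketch of the extended load_in_low_bit option. It assumes the patched from_pretrained wrapper is exposed as AutoModelForCausalLM in ipex_llm.transformers.npu_model (the patch only shows the shared wrapper, not the public class), and the model id and prompt below are placeholders. Per the try/except above, 'sym_int4' is only available with intel_npu_acceleration_library >= 1.1.0, while 'sym_int8', 'fp16', and 'fp32' work on both library versions; 'fp32' is the new default after patch 2/2.

    # Usage sketch only; model id and prompt are placeholders.
    import torch
    from transformers import AutoTokenizer
    from ipex_llm.transformers.npu_model import AutoModelForCausalLM  # assumed export

    model_path = "meta-llama/Llama-2-7b-chat-hf"  # placeholder

    # 'fp16' maps to torch.half in qtype_map before npu_lib.compile is called;
    # omitting load_in_low_bit now falls back to 'fp32'.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        load_in_low_bit="fp16",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    inputs = tokenizer("What does an NPU accelerate?", return_tensors="pt")
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))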