deepspeed zero3 QLoRA finetuning #11625

Merged: 30 commits, merged on Aug 13, 2024. The diff below shows changes from 15 commits.

Commits
526aa20
deepspeed zero3 QLoRA finetuning
Uxito-Ada Jul 19, 2024
ce04901
Update convert.py
Uxito-Ada Jul 21, 2024
e8c083a
Update low_bit_linear.py
Uxito-Ada Jul 21, 2024
baec9e9
Update utils.py
Uxito-Ada Jul 21, 2024
a329756
Update qlora_finetune_llama2_13b_arch_2_card.sh
Uxito-Ada Jul 21, 2024
2f7ba16
Update low_bit_linear.py
Uxito-Ada Jul 21, 2024
65d5403
Update alpaca_qlora_finetuning.py
Uxito-Ada Jul 21, 2024
6bd5811
Update low_bit_linear.py
Uxito-Ada Jul 21, 2024
876266a
Update utils.py
Uxito-Ada Jul 21, 2024
154a110
Update convert.py
Uxito-Ada Jul 21, 2024
dc2bb4d
Update alpaca_qlora_finetuning.py
Uxito-Ada Jul 21, 2024
ccd53ee
Update alpaca_qlora_finetuning.py
Uxito-Ada Jul 21, 2024
8b3e9e4
Update low_bit_linear.py
Uxito-Ada Jul 21, 2024
1f53ba8
Update deepspeed_zero3.json
Uxito-Ada Jul 21, 2024
3f4b35b
Update qlora_finetune_llama2_13b_arch_2_card.sh
Uxito-Ada Jul 21, 2024
a69d038
Update low_bit_linear.py
Uxito-Ada Jul 22, 2024
dad4684
Update low_bit_linear.py
Uxito-Ada Aug 2, 2024
6df300c
Update utils.py
Uxito-Ada Aug 2, 2024
4d243ff
Merge branch 'main' into heyang_24_7_19
Uxito-Ada Aug 2, 2024
3f3d612
fix style
Uxito-Ada Aug 2, 2024
2ab7220
fix style
Uxito-Ada Aug 2, 2024
5118595
Update alpaca_qlora_finetuning.py
Uxito-Ada Aug 12, 2024
13884f5
Update qlora_finetune_llama2_13b_arch_2_card.sh
Uxito-Ada Aug 12, 2024
486da9c
Update convert.py
Uxito-Ada Aug 12, 2024
2d8550f
Update low_bit_linear.py
Uxito-Ada Aug 12, 2024
95c252b
Update model.py
Uxito-Ada Aug 12, 2024
369c369
Update alpaca_qlora_finetuning.py
Uxito-Ada Aug 12, 2024
ce99bca
Update low_bit_linear.py
Uxito-Ada Aug 13, 2024
17dbf80
Update low_bit_linear.py
Uxito-Ada Aug 13, 2024
1c8cd6c
Update low_bit_linear.py
Uxito-Ada Aug 13, 2024
alpaca_qlora_finetuning.py

@@ -144,6 +144,14 @@ def train(

prompter = Prompter(prompt_template_name)

if "zero3" in deepspeed:
from ipex_llm.transformers.utils \
import _constant_buffered_norm2
from ipex_llm.llm_patching import replace_attr
import deepspeed as ds
replace_attr(ds.runtime.zero.stage3.DeepSpeedZeroOptimizer_Stage3,
"_constant_buffered_norm2", _constant_buffered_norm2)

device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
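The zero3 branch above monkey-patches DeepSpeed's ZeRO Stage-3 optimizer so that its gradient-norm helper avoids FP64 (see the utils.py change at the end of this PR). As a rough, hypothetical sketch only — the real ipex_llm.llm_patching.replace_attr may differ — a helper with this call signature is typically a thin setattr wrapper:

    # Hypothetical sketch of an attribute-replacement helper with the signature
    # used above; the actual ipex_llm.llm_patching.replace_attr may differ.
    def replace_attr(obj, name: str, value) -> None:
        """Monkey-patch obj.<name> with value, keeping the original as _orig_<name>."""
        original = getattr(obj, name, None)
        if original is not None:
            setattr(obj, "_orig_" + name, original)  # keep a handle to the original
        setattr(obj, name, value)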
@@ -162,6 +170,7 @@ def train(
torch_dtype=torch.bfloat16,
modules_to_not_convert=["lm_head"],
trust_remote_code=True,
enable_deepspeed_zero3="zero3" in deepspeed
)
else:
# According to the QLoRA paper, using "nf4" could yield better model quality than "int4"
@@ -175,7 +184,8 @@ def train(
model = AutoModelForCausalLM.from_pretrained(base_model,
torch_dtype=torch.bfloat16,
quantization_config=bnb_config,
trust_remote_code=True)
trust_remote_code=True,
enable_deepspeed_zero3="zero3" in deepspeed)
# below is also supported
# Load the base model from a directory or the HF Hub to 4-bit format
# model = AutoModelForCausalLM.from_pretrained(
@@ -186,9 +196,10 @@ def train(
# # device_map=device_map,
# modules_to_not_convert=["lm_head"],
# )
print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")
if not "zero3" in deepspeed:
print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}")
model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}')
print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}")

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}")
deepspeed_zero3.json (new file)
@@ -0,0 +1,15 @@
{
  "zero_optimization": {
    "stage": 3,
    "contiguous_gradients": true,
    "overlap_comm": true,
    "offload_optimizer": {"device": "cpu"}
  },
  "bf16": {
    "enabled": true
  },
  "world_size": 2,
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 2,
  "gradient_accumulation_steps": 8
}
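DeepSpeed expects train_batch_size to equal train_micro_batch_size_per_gpu × gradient_accumulation_steps × the number of data-parallel ranks; here 2 × 8 × 2 = 32, which matches the config. A minimal self-check, assuming the file is saved as ./deepspeed_zero3.json as in the launch script below:

    # Sketch: verify the DeepSpeed batch-size identity for this config.
    import json

    with open("./deepspeed_zero3.json") as f:
        cfg = json.load(f)

    micro = cfg["train_micro_batch_size_per_gpu"]   # 2
    accum = cfg["gradient_accumulation_steps"]      # 8
    ranks = cfg["world_size"]                       # 2 GPUs / data-parallel ranks
    assert cfg["train_batch_size"] == micro * accum * ranks   # 2 * 8 * 2 == 32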
qlora_finetune_llama2_13b_arch_2_card.sh (new file)
@@ -0,0 +1,42 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29503
export FI_PROVIDER=tcp
export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets
export UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0
basekit_root=/opt/intel/oneapi
source $basekit_root/setvars.sh --force
source $basekit_root/ccl/latest/env/vars.sh --force

NUM_GPUS=2 # number of GPUs used
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export TORCH_LLM_ALLREDUCE=0 # Different from PVC
export DS_SKIP_CUDA_CHECK=1
export IPEX_LLM_ENABLE_DEEPSPEED_ZERO3=1

mpirun -n $NUM_GPUS \
python -u ./alpaca_qlora_finetuning.py \
--base_model "meta-llama/Llama-2-13b-hf" \
--data_path "yahma/alpaca-cleaned" \
--output_dir "./ipex-llm-qlora-alpaca" \
--gradient_checkpointing True \
--micro_batch_size 2 \
--batch_size 32 \
--deepspeed ./deepspeed_zero3.json
8 changes: 7 additions & 1 deletion python/llm/src/ipex_llm/transformers/convert.py
@@ -317,6 +317,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
mixed_precision=False,
act_order=False,
enable_scale_search=False,
enable_deepspeed_zero3=False,
):
from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
FP16Linear, BF16Linear
@@ -404,6 +405,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
enable_xetla=enable_xetla,
optimize_lm_head=optimize_lm_head,
enable_scale_search=enable_scale_search,
enable_deepspeed_zero3=enable_deepspeed_zero3
)
device = module.weight.data.device
# Copy the weights
@@ -416,6 +418,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
imatrix=cur_imatrix,
in_features=in_features,
enable_xetla=enable_xetla,
enable_deepspeed_zero3=enable_deepspeed_zero3,
enable_scale_search=enable_scale_search).to(device)
new_linear._parameters['weight'] = paramsLowBit
if module.bias is not None:
@@ -492,6 +495,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
mixed_precision=mixed_precision,
act_order=act_order,
enable_scale_search=enable_scale_search,
enable_deepspeed_zero3=enable_deepspeed_zero3,
)
has_been_replaced = _flag or has_been_replaced
return model, has_been_replaced
@@ -747,7 +751,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
imatrix_data=None,
embedding_qtype=None,
enable_xetla=False,
mixed_precision=False):
mixed_precision=False,
enable_deepspeed_zero3=False):
if qtype in ggml_tensor_qtype.values():
index = list(ggml_tensor_qtype.values()).index(qtype)
logger.info(f"Converting the current model to "
@@ -793,6 +798,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
mixed_precision=mixed_precision,
act_order=act_order,
enable_scale_search=enable_scale_search,
enable_deepspeed_zero3=enable_deepspeed_zero3,
)
if not has_been_replaced:
warnings.warn(
61 changes: 47 additions & 14 deletions python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -208,7 +208,8 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,
device=None, convert_shape_only=False,
imatrix: torch.Tensor=None,
in_features: int=None,
enable_scale_search: bool=False):
enable_scale_search: bool=False,
enable_deepspeed_zero3: bool=False):
QK = ggml.ggml_qk_size(qtype)
block_size_in_bytes = ggml.ggml_type_size(qtype)

@@ -229,8 +230,12 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,
scale = torch.empty(n // k, dtype=torch.float32,
device=device)
else:
dst_tensor = torch.empty(dst_size, dtype=torch.uint8,
device=device)
if enable_deepspeed_zero3:
    dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16,
                             device=device)
else:
    dst_tensor = torch.empty(dst_size, dtype=torch.uint8,
                             device=device)

Review thread:
Contributor: Add comments for the magic value 2 and the hard-coded bfloat16 type.
Contributor (Author): done

Contributor: I think we should always do that for NF4 (only)?
Contributor (Author): Other NF4 weights are packed in torch.uint8, which leaves the buffer length unchanged. Only DeepSpeed ZeRO-3 needs NF4 packed as torch.bfloat16, which is why the element count is halved here.

if not convert_shape_only and device != 'meta':
dst = ctypes.c_void_p(dst_tensor.data.data_ptr())
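The halving in the zero3 branch above works because the NF4 blocks are just raw bytes: a bfloat16 element occupies 2 bytes, so a buffer of dst_size bytes holds dst_size // 2 bfloat16 elements, and viewing it back as torch.uint8 (as the matmul paths below do) recovers the original byte layout. A minimal illustration of the reinterpretation, not IPEX-LLM code:

    # Sketch: the same quantized byte buffer viewed as bfloat16 and as uint8.
    import torch

    dst_size = 1024                                   # bytes needed for the NF4 blocks
    packed = torch.empty(dst_size // 2, dtype=torch.bfloat16)   # 2 bytes per element
    as_bytes = packed.view(torch.uint8)               # zero-copy reinterpretation
    assert as_bytes.numel() == dst_size
    assert as_bytes.data_ptr() == packed.data_ptr()   # same storage, no copy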
@@ -259,9 +264,12 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,


def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int):
import os
enable_deepspeed_zero3 = (os.getenv("IPEX_LLM_ENABLE_DEEPSPEED_ZERO3", "0") == "1")

Review thread:
Contributor: Move the os import to the top of the file, since other modules may share this import.

invalidInputError(tensor.dtype == torch.uint8,
"Input tensor must be uint8")
if not enable_deepspeed_zero3:
invalidInputError(tensor.dtype == torch.uint8,
"Input tensor must be uint8")

invalidInputError(tensor.device == torch.device('cpu'),
"Input tensor must be uint8")
@@ -381,7 +389,8 @@ def __new__(cls,
imatrix=None,
in_features=None,
enable_xetla=False,
enable_scale_search=False):
enable_scale_search=False,
enable_deepspeed_zero3=False):
if data is None:
data = torch.empty(0)

@@ -395,6 +404,7 @@ def __new__(cls,
self.in_features = in_features
self.enable_xetla = enable_xetla
self.enable_scale_search = enable_scale_search
self.enable_deepspeed_zero3 = enable_deepspeed_zero3
return self

def ggml_mse(self, w, ggml_qtype, device):
@@ -453,7 +463,8 @@ def quantize(self, device=None):
convert_shape_only=self.convert_shape_only,
imatrix=self.imatrix,
in_features=self.in_features,
enable_scale_search=self.enable_scale_search)
enable_scale_search=self.enable_scale_search,
enable_deepspeed_zero3=self.enable_deepspeed_zero3)
self.data = w_quantized
self.quantized = True
self._shape = w.shape
@@ -581,7 +592,13 @@ class MatMulLowBit(torch.autograd.Function):
def forward(ctx, A, weight, input_seq_size):
ctx.is_empty = False
import xe_linear
result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size)
if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3:
result = xe_linear.forward_new(A,
weight.data.view(torch.uint8),
weight.qtype,
input_seq_size)
else:
result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size)
if any(ctx.needs_input_grad[:2]):
ctx.tensors = (A, weight)
else:
@@ -601,7 +618,12 @@ def backward(ctx, grad_output):
if req_gradA:
if torch.xpu.is_autocast_xpu_enabled():
grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype())
dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype)
if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3:
dequant_weight = xe_linear.dequant(A,
weight.data.view(torch.uint8),
weight.qtype)
else:
dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype)
grad_A = torch.matmul(grad_output, dequant_weight.reshape(weight._shape))

return grad_A, grad_weight, None
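For reference, the gradient formed here is the standard input gradient of a linear layer: with y = x @ W.T and W dequantized back to weight._shape = (out_features, in_features), the gradient of the loss with respect to the input is grad_output @ W. A toy check on dense tensors (not the quantized XPU kernel):

    # Sketch: the input-gradient formula used in MatMulLowBit.backward, on dense tensors.
    import torch

    x = torch.randn(4, 8, requires_grad=True)
    W = torch.randn(16, 8)                  # (out_features, in_features), like weight._shape
    y = x @ W.t()
    y.sum().backward()                      # upstream gradient of ones
    grad_output = torch.ones_like(y)
    manual_grad_A = grad_output @ W         # matmul(grad_output, dequant_weight.reshape(shape))
    assert torch.allclose(x.grad, manual_grad_A)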
@@ -640,13 +662,15 @@ class LowBitLinear(nn.Linear):
def __init__(self, input_features, output_features, qtype, bias=True,
conver_to_half=True, mp_group=None, enable_xetla=False,
optimize_lm_head=False, act_order=False,
enable_scale_search=False):
enable_scale_search=False,
enable_deepspeed_zero3=False):
super().__init__(input_features, output_features, bias)
self.weight = FP4Params(self.weight.data,
requires_grad=False,
quantized=False, _shape=None, qtype=qtype,
enable_xetla=enable_xetla,
enable_scale_search=enable_scale_search)
enable_scale_search=enable_scale_search,
enable_deepspeed_zero3=enable_deepspeed_zero3)
self.in_len = input_features
self.out_len = output_features
self.weight_shape = (self.out_len, self.in_len)
@@ -666,6 +690,7 @@ def __init__(self, input_features, output_features, qtype, bias=True,
self.is_lm_head = self.in_len * self.out_len >= 32000 * 4096 and self.bias is None
self.low_memory_mode = self.is_lm_head
self.act_order = act_order
self.enable_deepspeed_zero3 = enable_deepspeed_zero3
if act_order:
self.register_buffer(
"g_idx_map",
@@ -736,9 +761,17 @@ def forward(self, x: torch.Tensor):
if x_2d.requires_grad:
result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size)
else:
result = xe_linear.forward_new(x_2d, self.weight.data,
self.weight.qtype,
input_seq_size)
if hasattr(self.weight, "enable_deepspeed_zero3") \
and self.weight.enable_deepspeed_zero3:
result = xe_linear.forward_new(x_2d,
self.weight.data.view(torch.uint8),
self.weight.qtype,
input_seq_size)
else:
result = xe_linear.forward_new(x_2d,
self.weight.data,
self.weight.qtype,
input_seq_size)
elif self.enable_xetla:
x_2d = x_2d.half()
result = xe_linear.mm_xetla(x_2d, self.weight.data, self.qtype)
4 changes: 3 additions & 1 deletion python/llm/src/ipex_llm/transformers/model.py
@@ -454,6 +454,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
if embedding_qtype is not None:
embedding_qtype = ggml_tensor_qtype[embedding_qtype]
enable_xetla = kwargs.pop("enable_xetla", False)
enable_deepspeed_zero3 = kwargs.pop("enable_deepspeed_zero3", False)
_args = copy.deepcopy(args)
_kwargs = copy.deepcopy(kwargs)
awq_config = None
@@ -524,7 +525,8 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs):
imatrix_data=imatrix_data,
embedding_qtype=embedding_qtype,
enable_xetla=enable_xetla,
mixed_precision=mixed_precision)
mixed_precision=mixed_precision,
enable_deepspeed_zero3=enable_deepspeed_zero3)
Review thread:
Contributor: I don't think we want to introduce this user-level parameter; we should either change all NF4 to BF16, or all training (QLoRA) NF4 to BF16, instead of doing something special for ZeRO-3 only.
Contributor (Author): Please take a look again @jason-dai @qiyuangong
if disk_embedding:
from ipex_llm.transformers.embedding import DiskEmbedding
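For reference, the flag consumed by load_convert here is the same keyword that the example script passes at user level. A condensed sketch of the call pattern; the BitsAndBytesConfig fields are reconstructed from the script's comments rather than shown verbatim in this diff:

    # Sketch: user-level call that threads enable_deepspeed_zero3 into load_convert.
    import torch
    from transformers import BitsAndBytesConfig
    from ipex_llm.transformers import AutoModelForCausalLM

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",            # per the QLoRA paper, "nf4" can beat "int4"
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-13b-hf",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
        trust_remote_code=True,
        enable_deepspeed_zero3=True,          # new in this PR; set when the DeepSpeed config uses ZeRO stage 3
    )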
12 changes: 12 additions & 0 deletions python/llm/src/ipex_llm/transformers/utils.py
@@ -361,3 +361,15 @@ def get_modelscope_hf_config(model_id_or_path: str,
def is_torch_bf16_gpu_available():
# always true for XPU and CPU
return True


# Arc platform does not support FP64, so disable FP64 in
# DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method
def _constant_buffered_norm2(self, input, buffer_size=250000000):
    norm = None
    for part in input.view(-1).split(buffer_size):
        if norm is None:
            norm = part.data.norm(2)**2.0
        else:
            norm += part.data.norm(2)**2.0
    return norm**0.5

Review thread:
Contributor: What's different between our implementation and DeepSpeed's?
Contributor (Author): DeepSpeed's version calls double(), i.e. FP64; this one removes the double() call, since Arc does not support FP64.
Contributor: OK
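The chunked computation relies on the identity ||x||_2 = sqrt(sum_i ||x_i||_2^2) over any split of x, so dropping DeepSpeed's .double() only lowers the accumulation precision; it does not change the formula. A quick sanity check, not part of the PR:

    # Sketch: the buffered 2-norm above matches torch's full-tensor norm.
    import torch

    x = torch.randn(1_000_000, dtype=torch.float32)
    norm_sq = torch.zeros((), dtype=torch.float32)
    for part in x.view(-1).split(250_000):        # same chunking idea as above
        norm_sq += part.norm(2) ** 2
    chunked = norm_sq ** 0.5
    assert torch.allclose(chunked, torch.linalg.norm(x), rtol=1e-4)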