From 526aa208e35c55c0a4bd25f98ced105c4f78ef72 Mon Sep 17 00:00:00 2001
From: Uxito-Ada <414416158@qq.com>
Date: Fri, 19 Jul 2024 15:40:32 +0800
Subject: [PATCH 01/29] deepspeed zero3 QLoRA finetuning

---
 .../alpaca-qlora/alpaca_qlora_finetuning.py   |  7 +++
 .../QLoRA/alpaca-qlora/deepspeed_zero3.json   | 16 +++++
 .../qlora_finetune_llama2_13b_arch_2_card.sh  | 41 ++++++++++++
 .../llm/src/ipex_llm/transformers/convert.py  |  9 ++-
 .../ipex_llm/transformers/low_bit_linear.py   | 62 ++++++++++++++-----
 python/llm/src/ipex_llm/transformers/model.py |  4 +-
 python/llm/src/ipex_llm/transformers/utils.py | 11 ++++
 7 files changed, 131 insertions(+), 19 deletions(-)
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json
 create mode 100644 python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh

diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py
index 61916fff9b7..7df46a52561 100644
--- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py
+++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py
@@ -144,6 +144,12 @@ def train(
 
     prompter = Prompter(prompt_template_name)
 
+    if "zero3" in deepspeed:
+        from ipex_llm.transformers.utils \
+            import _constant_buffered_norm2
+        replace_attr(deepspeed.runtime.zero.stage3.DeepSpeedZeroOptimizer_Stage3,
+                     "_constant_buffered_norm2", _constant_buffered_norm2)
+
     device_map = "auto"
     world_size = int(os.environ.get("WORLD_SIZE", 1))
     ddp = world_size != 1
@@ -162,6 +168,7 @@ def train(
             torch_dtype=torch.bfloat16,
             modules_to_not_convert=["lm_head"],
             trust_remote_code=True,
+            enable_deepspeed_zero3="zero3" in deepspeed
         )
     else:
         # According to the QLoRA paper, using "nf4" could yield better model quality than "int4"
diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json
new file mode 100644
index 00000000000..39143b5c991
--- /dev/null
+++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json
@@ -0,0 +1,16 @@
+{
+  "zero_optimization": {
+    "stage": 3,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto"
+}
+
diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh
new file mode 100644
index 00000000000..cbe1600e5b1
--- /dev/null
+++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh
@@ -0,0 +1,41 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+export MASTER_ADDR=127.0.0.1
+export MASTER_PORT=29503
+export FI_PROVIDER=tcp
+export CCL_ATL_TRANSPORT=ofi
+export CCL_ZE_IPC_EXCHANGE=sockets
+export UR_L0_IN_ORDER_BARRIER_BY_SIGNAL=0
+basekit_root=/opt/intel/oneapi
+source $basekit_root/setvars.sh --force
+source $basekit_root/ccl/latest/env/vars.sh --force
+
+NUM_GPUS=2 # number of used GPU
+export USE_XETLA=OFF
+export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+export TORCH_LLM_ALLREDUCE=0 # Different from PVC
+export DS_SKIP_CUDA_CHECK=1
+
+mpirun -n $NUM_GPUS \
+       python -u ./alpaca_qlora_zero3_finetuning.py \
+       --base_model "meta-llama/Llama-2-13b-hf" \
+       --data_path "yahma/alpaca-cleaned" \
+       --output_dir "./ipex-llm-qlora-alpaca" \
+       --gradient_checkpointing True \
+       --micro_batch_size 2 \
+       --batch_size 32 \
+       --deepspeed ./deepspeed_zero3.json
diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index cc820fa1674..7e8184e6f85 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -317,6 +317,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                  mixed_precision=False,
                                  act_order=False,
                                  enable_scale_search=False,
+                                 enable_deepspeed_zero3=False,
                                  ):
     from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \
         FP16Linear, BF16Linear
@@ -404,6 +405,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                         enable_xetla=enable_xetla,
                         optimize_lm_head=optimize_lm_head,
                         enable_scale_search=enable_scale_search,
+                        enable_deepspeed_zero3=enable_deepspeed_zero3
                     )
                     device = module.weight.data.device
                     # Copy the weights
@@ -416,7 +418,8 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
                                              imatrix=cur_imatrix,
                                              in_features=in_features,
                                              enable_xetla=enable_xetla,
-                                             enable_scale_search=enable_scale_search).to(device)
+                                             enable_scale_search=enable_scale_search
+                                             enable_deepspeed_zero3=enable_deepspeed_zero3).to(device)
                     new_linear._parameters['weight'] = paramsLowBit
                     if module.bias is not None:
                         new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\
@@ -747,7 +750,8 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                          imatrix_data=None,
                          embedding_qtype=None,
                          enable_xetla=False,
-                         mixed_precision=False):
+                         mixed_precision=False,
+                         enable_deepspeed_zero3=False):
     if qtype in ggml_tensor_qtype.values():
         index = list(ggml_tensor_qtype.values()).index(qtype)
         logger.info(f"Converting the current model to "
@@ -793,6 +797,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True,
                             mixed_precision=mixed_precision,
                             act_order=act_order,
                             enable_scale_search=enable_scale_search,
+                            enable_deepspeed_zero3=enable_deepspeed_zero3,
                         )
     if not has_been_replaced:
         warnings.warn(
diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
index c30ca4a284e..eacfbba86f3 100644
--- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py
+++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py
@@ -208,7 +208,8 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,
                        device=None, convert_shape_only=False,
                        imatrix: torch.Tensor=None,
                        in_features: int=None,
-                       enable_scale_search: bool=False):
+                       enable_scale_search: bool=False,
+                       enable_deepspeed_zero3: bool=False):
     QK = ggml.ggml_qk_size(qtype)
     block_size_in_bytes = ggml.ggml_type_size(qtype)
 
@@ -229,8 +230,12 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int,
         scale = torch.empty(n // k, dtype=torch.float32,
device=device) else: - dst_tensor = torch.empty(dst_size, dtype=torch.uint8, - device=device) + if enable_deepspeed_zero3: + dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, + device=device) + else: + dst_tensor = torch.empty(dst_size, dtype=torch.uint8, + device=device) if not convert_shape_only and device != 'meta': dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) @@ -258,10 +263,11 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, return dst_tensor -def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): - - invalidInputError(tensor.dtype == torch.uint8, - "Input tensor must be uint8") +def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int + enable_deepspeed_zero3=False): + if not enable_deepspeed_zero3=False: + invalidInputError(tensor.dtype == torch.uint8, + "Input tensor must be uint8") invalidInputError(tensor.device == torch.device('cpu'), "Input tensor must be uint8") @@ -381,7 +387,8 @@ def __new__(cls, imatrix=None, in_features=None, enable_xetla=False, - enable_scale_search=False): + enable_scale_search=False, + enable_deepspeed_zero3=False): if data is None: data = torch.empty(0) @@ -395,6 +402,7 @@ def __new__(cls, self.in_features = in_features self.enable_xetla = enable_xetla self.enable_scale_search = enable_scale_search + self.enable_deepspeed_zero3 = enable_deepspeed_zero3 return self def ggml_mse(self, w, ggml_qtype, device): @@ -453,7 +461,8 @@ def quantize(self, device=None): convert_shape_only=self.convert_shape_only, imatrix=self.imatrix, in_features=self.in_features, - enable_scale_search=self.enable_scale_search) + enable_scale_search=self.enable_scale_search, + enable_deepspeed_zero3=self.enable_deepspeed_zero3) self.data = w_quantized self.quantized = True self._shape = w.shape @@ -581,7 +590,13 @@ class MatMulLowBit(torch.autograd.Function): def forward(ctx, A, weight, input_seq_size): ctx.is_empty = False import xe_linear - result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size) + if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + result = xe_linear.forward_new(A, + weight.data.view(torch.uint8), + weight.qtype, + input_seq_size) + else: + result = xe_linear.forward_new(A, weight.data, weight.qtype, input_seq_size) if any(ctx.needs_input_grad[:2]): ctx.tensors = (A, weight) else: @@ -601,7 +616,12 @@ def backward(ctx, grad_output): if req_gradA: if torch.xpu.is_autocast_xpu_enabled(): grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype()) - dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype) + if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + dequant_weight = xe_linear.dequant(A, + weight.data.view(torch.uint8), + weight.qtype) + else: + dequant_weight = xe_linear.dequant(A, weight.data, weight.qtype) grad_A = torch.matmul(grad_output, dequant_weight.reshape(weight._shape)) return grad_A, grad_weight, None @@ -640,13 +660,15 @@ class LowBitLinear(nn.Linear): def __init__(self, input_features, output_features, qtype, bias=True, conver_to_half=True, mp_group=None, enable_xetla=False, optimize_lm_head=False, act_order=False, - enable_scale_search=False): + enable_scale_search=False, + enable_deepspeed_zero3=False): super().__init__(input_features, output_features, bias) self.weight = FP4Params(self.weight.data, requires_grad=False, quantized=False, _shape=None, qtype=qtype, enable_xetla=enable_xetla, - enable_scale_search=enable_scale_search) + 
enable_scale_search=enable_scale_search, + enable_deepspeed_zero3=enable_deepspeed_zero3) self.in_len = input_features self.out_len = output_features self.weight_shape = (self.out_len, self.in_len) @@ -666,6 +688,7 @@ def __init__(self, input_features, output_features, qtype, bias=True, self.is_lm_head = self.in_len * self.out_len >= 32000 * 4096 and self.bias is None self.low_memory_mode = self.is_lm_head self.act_order = act_order + self.enable_deepspeed_zero3 = enable_deepspeed_zero3 if act_order: self.register_buffer( "g_idx_map", @@ -736,9 +759,16 @@ def forward(self, x: torch.Tensor): if x_2d.requires_grad: result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - result = xe_linear.forward_new(x_2d, self.weight.data, - self.weight.qtype, - input_seq_size) + if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + result = xe_linear.forward_new(x_2d, + self.weight.data.view(torch.uint8), + self.weight.qtype, + input_seq_size) + else: + result = xe_linear.forward_new(x_2d, + self.weight.data, + self.weight.qtype, + input_seq_size) elif self.enable_xetla: x_2d = x_2d.half() result = xe_linear.mm_xetla(x_2d, self.weight.data, self.qtype) diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py index d9dd6354970..3bb0cd1de89 100644 --- a/python/llm/src/ipex_llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -454,6 +454,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): if embedding_qtype is not None: embedding_qtype = ggml_tensor_qtype[embedding_qtype] enable_xetla = kwargs.pop("enable_xetla", False) + enable_deepspeed_zero3 = kwargs.pop("enable_deepspeed_zero3", False) _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) awq_config = None @@ -524,7 +525,8 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): imatrix_data=imatrix_data, embedding_qtype=embedding_qtype, enable_xetla=enable_xetla, - mixed_precision=mixed_precision) + mixed_precision=mixed_precision, + enable_deepspeed_zero3=enable_deepspeed_zero3) if disk_embedding: from ipex_llm.transformers.embedding import DiskEmbedding diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index 74e10244042..9756b3db6c1 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -361,3 +361,14 @@ def get_modelscope_hf_config(model_id_or_path: str, def is_torch_bf16_gpu_available(): # always true for XPU and CPU return True + +# Arc platfrom does not support FP64, +# Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method +def _constant_buffered_norm2(self, input, buffer_size=250000000): + norm = None + for part in input.view(-1).split(buffer_size): + if norm is None: + norm = part.data.norm(2)**2.0 + else: + norm += part.data.norm(2)**2.0 + return norm**0.5 From ce049015ea22c72d10b9915f848d64f5fc0ec530 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:45:30 +0800 Subject: [PATCH 02/29] Update convert.py --- python/llm/src/ipex_llm/transformers/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 7e8184e6f85..6454ec26991 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -418,8 +418,8 @@ def 
_replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, imatrix=cur_imatrix, in_features=in_features, enable_xetla=enable_xetla, - enable_scale_search=enable_scale_search - enable_deepspeed_zero3=enable_deepspeed_zero3).to(device) + enable_deepspeed_zero3=enable_deepspeed_zero3, + enable_scale_search=enable_scale_search).to(device) new_linear._parameters['weight'] = paramsLowBit if module.bias is not None: new_linear._parameters['bias'] = nn.Parameter(module.bias.data)\ From e8c083a119b63812980e2501f7c8de27d7cd733f Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:46:16 +0800 Subject: [PATCH 03/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index eacfbba86f3..c0d5cb802d6 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -265,7 +265,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int enable_deepspeed_zero3=False): - if not enable_deepspeed_zero3=False: + if not enable_deepspeed_zero3: invalidInputError(tensor.dtype == torch.uint8, "Input tensor must be uint8") From baec9e963192558eed7ac5d8a46862da3e9ba37d Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 19:49:56 +0800 Subject: [PATCH 04/29] Update utils.py --- python/llm/src/ipex_llm/transformers/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index 9756b3db6c1..f4aa035688c 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -362,6 +362,7 @@ def is_torch_bf16_gpu_available(): # always true for XPU and CPU return True + # Arc platfrom does not support FP64, # Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method def _constant_buffered_norm2(self, input, buffer_size=250000000): From a32975654da532f83c51934e1193c868eddb950d Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:03:53 +0800 Subject: [PATCH 05/29] Update qlora_finetune_llama2_13b_arch_2_card.sh --- .../QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh index cbe1600e5b1..ba5a11b03b0 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh @@ -31,7 +31,7 @@ export TORCH_LLM_ALLREDUCE=0 # Different from PVC export DS_SKIP_CUDA_CHECK=1 mpirun -n $NUM_GPUS \ - python -u ./alpaca_qlora_zero3_finetuning.py \ + python -u ./alpaca_qlora_finetuning.py \ --base_model "meta-llama/Llama-2-13b-hf" \ --data_path "yahma/alpaca-cleaned" \ --output_dir "./ipex-llm-qlora-alpaca" \ From 2f7ba1648fac0e7ca36c4c01e1280254dde3350c Mon Sep 17 00:00:00 2001 From: Heyang Sun 
<60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:04:39 +0800 Subject: [PATCH 06/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index c0d5cb802d6..192fe28b30b 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -263,7 +263,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, return dst_tensor -def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int +def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int, enable_deepspeed_zero3=False): if not enable_deepspeed_zero3: invalidInputError(tensor.dtype == torch.uint8, From 65d5403d121cd2a1224bf70931ca71a3599bad25 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:14:54 +0800 Subject: [PATCH 07/29] Update alpaca_qlora_finetuning.py --- .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 7df46a52561..ce4268d7522 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -147,7 +147,9 @@ def train( if "zero3" in deepspeed: from ipex_llm.transformers.utils \ import _constant_buffered_norm2 - replace_attr(deepspeed.runtime.zero.stage3.DeepSpeedZeroOptimizer_Stage3, + from ipex_llm.llm_patching import replace_attr + import deepspeed as ds + replace_attr(ds.runtime.zero.stage3.DeepSpeedZeroOptimizer_Stage3, "_constant_buffered_norm2", _constant_buffered_norm2) device_map = "auto" From 6bd5811937b41782385686d885979ab50228dce8 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:38:09 +0800 Subject: [PATCH 08/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 192fe28b30b..f3e92cce351 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -759,7 +759,8 @@ def forward(self, x: torch.Tensor): if x_2d.requires_grad: result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + if hasattr(self.weight, "enable_deepspeed_zero3") \ + and self.weight.enable_deepspeed_zero3: result = xe_linear.forward_new(x_2d, self.weight.data.view(torch.uint8), self.weight.qtype, From 876266aa0be58d392bc108c35ccf17b9bb0e3380 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 20:41:52 +0800 Subject: [PATCH 09/29] Update utils.py --- python/llm/src/ipex_llm/transformers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index f4aa035688c..cb807c39c9d 100644 --- 
a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -363,7 +363,7 @@ def is_torch_bf16_gpu_available(): return True -# Arc platfrom does not support FP64, +# Arc platfrom does not support FP64, # Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method def _constant_buffered_norm2(self, input, buffer_size=250000000): norm = None From 154a110131ea7aaa2020336fa227ee90f19811ec Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:01:42 +0800 Subject: [PATCH 10/29] Update convert.py --- python/llm/src/ipex_llm/transformers/convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 6454ec26991..e394b6ad6bd 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -495,6 +495,7 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, mixed_precision=mixed_precision, act_order=act_order, enable_scale_search=enable_scale_search, + enable_deepspeed_zero3=enable_deepspeed_zero3, ) has_been_replaced = _flag or has_been_replaced return model, has_been_replaced From dc2bb4d2292f2fd4603d9f0a55bc28686b4c8b3e Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:09:25 +0800 Subject: [PATCH 11/29] Update alpaca_qlora_finetuning.py --- .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index ce4268d7522..9b314433d6e 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -184,7 +184,8 @@ def train( model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16, quantization_config=bnb_config, - trust_remote_code=True) + trust_remote_code=True, + enable_deepspeed_zero3="zero3" in deepspeed) # below is also supported # Load the base model from a directory or the HF Hub to 4-bit format # model = AutoModelForCausalLM.from_pretrained( From ccd53eedacf4c85e47d3f14a092e7f0e9264b53a Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:29:43 +0800 Subject: [PATCH 12/29] Update alpaca_qlora_finetuning.py --- .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 9b314433d6e..51bf312294b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -196,9 +196,10 @@ def train( # # device_map=device_map, # modules_to_not_convert=["lm_head"], # ) - print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}") - model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}') - print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") + if not "zero3" in deepspeed: + print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}") + model = 
model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}') + print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) print(f"Tokenizer loaded on rank {os.environ.get('LOCAL_RANK')}") From 8b3e9e42793aa418a3d69df38455f2fb13825f8d Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:31:11 +0800 Subject: [PATCH 13/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index f3e92cce351..ca881383ffe 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -263,8 +263,10 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, return dst_tensor -def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int, - enable_deepspeed_zero3=False): +def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): + import os + enable_deepspeed_zero3 = (os.getenv("IPEX_LLM_ENABLE_DEEPSPEED_ZERO3", "0") == "1") + if not enable_deepspeed_zero3: invalidInputError(tensor.dtype == torch.uint8, "Input tensor must be uint8") From 1f53ba8f0f0c0e8166b78e843cef9ed052f64b45 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:31:54 +0800 Subject: [PATCH 14/29] Update deepspeed_zero3.json --- .../QLoRA/alpaca-qlora/deepspeed_zero3.json | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json index 39143b5c991..7ee8a787c0b 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/deepspeed_zero3.json @@ -1,16 +1,15 @@ { "zero_optimization": { "stage": 3, - "offload_optimizer": { - "device": "cpu" - }, "contiguous_gradients": true, - "overlap_comm": true - }, + "overlap_comm": true, + "offload_optimizer": {"device": "cpu"} + }, "bf16": { "enabled": true }, - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto" + "world_size": 2, + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 8 } - From 3f4b35bd00ce984a254b0f8a9245d0798af560ba Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Sun, 21 Jul 2024 21:32:20 +0800 Subject: [PATCH 15/29] Update qlora_finetune_llama2_13b_arch_2_card.sh --- .../QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh index ba5a11b03b0..7a7b034f601 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh @@ -29,6 +29,7 @@ export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 export TORCH_LLM_ALLREDUCE=0 # Different from PVC export DS_SKIP_CUDA_CHECK=1 +export 
IPEX_LLM_ENABLE_DEEPSPEED_ZERO3=1 mpirun -n $NUM_GPUS \ python -u ./alpaca_qlora_finetuning.py \ From a69d0380441c183763bbc3d0f2aa3872e2a8b627 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:09:18 +0800 Subject: [PATCH 16/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index ca881383ffe..285d4e56f7e 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -264,7 +264,6 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): - import os enable_deepspeed_zero3 = (os.getenv("IPEX_LLM_ENABLE_DEEPSPEED_ZERO3", "0") == "1") if not enable_deepspeed_zero3: From dad468456502dbaca26c2ed84293a31d95902b69 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:05:49 +0800 Subject: [PATCH 17/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 285d4e56f7e..576a01a7b85 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -231,6 +231,8 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=device) else: if enable_deepspeed_zero3: + # Deepspeed zero3 requires unified dtype, thus here uses bfloat16 consistent to other layers + # dst_size above is computed based on uint8, and for bfloat16, buffer size should be half dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, device=device) else: From 6df300cd92e22ec416583144a5769749a9ae3410 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:07:31 +0800 Subject: [PATCH 18/29] Update utils.py --- python/llm/src/ipex_llm/transformers/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index cb807c39c9d..df1054721b1 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -365,6 +365,7 @@ def is_torch_bf16_gpu_available(): # Arc platfrom does not support FP64, # Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method +# original: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage3.py#L1365-L1372 def _constant_buffered_norm2(self, input, buffer_size=250000000): norm = None for part in input.view(-1).split(buffer_size): From 3f3d612d68220e5a98dcee3ba46b40fe2b421f21 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:15:50 +0800 Subject: [PATCH 19/29] fix style --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 576a01a7b85..fb292ce2125 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -231,8 
+231,10 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=device) else: if enable_deepspeed_zero3: - # Deepspeed zero3 requires unified dtype, thus here uses bfloat16 consistent to other layers - # dst_size above is computed based on uint8, and for bfloat16, buffer size should be half + # Deepspeed zero3 requires unified dtype, + # thus here uses bfloat16 consistent to other layers + # dst_size above is computed based on uint8, and for bfloat16, + # buffer size should be half dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, device=device) else: @@ -378,7 +380,6 @@ def use_batch_forward(x: torch.Tensor, qtype: int, output_len: int): # Rename to FP4Params to trigger initializing # the params layer with all parameters on the CPU -# https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/modeling.py#L333 class FP4Params(torch.nn.Parameter): def __new__(cls, data=None, From 2ab722003360076e6baf95dd1bf7e71a7c18ecce Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:18:06 +0800 Subject: [PATCH 20/29] fix style --- python/llm/src/ipex_llm/transformers/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/utils.py b/python/llm/src/ipex_llm/transformers/utils.py index 3f40a57e065..5cd706c2e5b 100644 --- a/python/llm/src/ipex_llm/transformers/utils.py +++ b/python/llm/src/ipex_llm/transformers/utils.py @@ -386,7 +386,7 @@ def check_hidden_size(qtype, hidden_size): # Arc platfrom does not support FP64, # Disable FP64 in DeepSpeedZeroOptimizer_Stage3's _constant_buffered_norm2 method -# original: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage3.py#L1365-L1372 +# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/zero/stage3.py#L1365 def _constant_buffered_norm2(self, input, buffer_size=250000000): norm = None for part in input.view(-1).split(buffer_size): @@ -394,4 +394,4 @@ def _constant_buffered_norm2(self, input, buffer_size=250000000): norm = part.data.norm(2)**2.0 else: norm += part.data.norm(2)**2.0 - return norm**0.5 \ No newline at end of file + return norm**0.5 From 511859563449938fc9f2138664db0b5dd0c67ec5 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:40:50 +0800 Subject: [PATCH 21/29] Update alpaca_qlora_finetuning.py --- .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 51bf312294b..691f905bb86 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -169,8 +169,7 @@ def train( optimize_model=False, torch_dtype=torch.bfloat16, modules_to_not_convert=["lm_head"], - trust_remote_code=True, - enable_deepspeed_zero3="zero3" in deepspeed + trust_remote_code=True ) else: # According to the QLoRA paper, using "nf4" could yield better model quality than "int4" @@ -184,8 +183,7 @@ def train( model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16, quantization_config=bnb_config, - trust_remote_code=True, - enable_deepspeed_zero3="zero3" in deepspeed) + trust_remote_code=True) # below is also supported # Load the 
base model from a directory or the HF Hub to 4-bit format # model = AutoModelForCausalLM.from_pretrained( From 13884f5bad0a8f6828838532c06da86a510671bf Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:41:31 +0800 Subject: [PATCH 22/29] Update qlora_finetune_llama2_13b_arch_2_card.sh --- .../QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh index 7a7b034f601..ba5a11b03b0 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/qlora_finetune_llama2_13b_arch_2_card.sh @@ -29,7 +29,6 @@ export USE_XETLA=OFF export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2 export TORCH_LLM_ALLREDUCE=0 # Different from PVC export DS_SKIP_CUDA_CHECK=1 -export IPEX_LLM_ENABLE_DEEPSPEED_ZERO3=1 mpirun -n $NUM_GPUS \ python -u ./alpaca_qlora_finetuning.py \ From 486da9c3bb9752cc98aff0cf53c04a40bca325db Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:43:04 +0800 Subject: [PATCH 23/29] Update convert.py --- python/llm/src/ipex_llm/transformers/convert.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 5fa7ef8dbfe..05d16926ab8 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -318,7 +318,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, mixed_precision=False, act_order=False, enable_scale_search=False, - enable_deepspeed_zero3=False, ): from ipex_llm.transformers.low_bit_linear import LowBitLinear, FP4Params, \ FP16Linear, BF16Linear @@ -409,7 +408,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, enable_xetla=enable_xetla, optimize_lm_head=optimize_lm_head, enable_scale_search=enable_scale_search, - enable_deepspeed_zero3=enable_deepspeed_zero3 ) device = module.weight.data.device # Copy the weights @@ -422,7 +420,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, imatrix=cur_imatrix, in_features=in_features, enable_xetla=enable_xetla, - enable_deepspeed_zero3=enable_deepspeed_zero3, enable_scale_search=enable_scale_search).to(device) new_linear._parameters['weight'] = paramsLowBit if module.bias is not None: @@ -499,7 +496,6 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None, mixed_precision=mixed_precision, act_order=act_order, enable_scale_search=enable_scale_search, - enable_deepspeed_zero3=enable_deepspeed_zero3, ) has_been_replaced = _flag or has_been_replaced return model, has_been_replaced @@ -758,8 +754,7 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, imatrix_data=None, embedding_qtype=None, enable_xetla=False, - mixed_precision=False, - enable_deepspeed_zero3=False): + mixed_precision=False): if qtype in ggml_tensor_qtype.values(): index = list(ggml_tensor_qtype.values()).index(qtype) logger.info(f"Converting the current model to " @@ -805,7 +800,6 @@ def ggml_convert_low_bit(model, qtype, optimize_model=True, mixed_precision=mixed_precision, act_order=act_order, 
enable_scale_search=enable_scale_search, - enable_deepspeed_zero3=enable_deepspeed_zero3, ) if not has_been_replaced: warnings.warn( From 2d8550fe7cdd2f92415778c68634f832afa2fd34 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:50:56 +0800 Subject: [PATCH 24/29] Update low_bit_linear.py --- .../ipex_llm/transformers/low_bit_linear.py | 49 +++++++------------ 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index fb292ce2125..f91c1c23468 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -208,8 +208,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, device=None, convert_shape_only=False, imatrix: torch.Tensor=None, in_features: int=None, - enable_scale_search: bool=False, - enable_deepspeed_zero3: bool=False): + enable_scale_search: bool=False): QK = ggml.ggml_qk_size(qtype) block_size_in_bytes = ggml.ggml_type_size(qtype) @@ -230,16 +229,12 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, scale = torch.empty(n // k, dtype=torch.float32, device=device) else: - if enable_deepspeed_zero3: - # Deepspeed zero3 requires unified dtype, - # thus here uses bfloat16 consistent to other layers - # dst_size above is computed based on uint8, and for bfloat16, - # buffer size should be half - dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, - device=device) - else: - dst_tensor = torch.empty(dst_size, dtype=torch.uint8, - device=device) + # Deepspeed zero3 requires unified dtype, + # thus here uses bfloat16 consistent to other layers + # dst_size above is computed based on uint8, and for bfloat16, + # buffer size should be half + dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, + device=device) if not convert_shape_only and device != 'meta': dst = ctypes.c_void_p(dst_tensor.data.data_ptr()) @@ -268,14 +263,11 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): - enable_deepspeed_zero3 = (os.getenv("IPEX_LLM_ENABLE_DEEPSPEED_ZERO3", "0") == "1") - - if not enable_deepspeed_zero3: - invalidInputError(tensor.dtype == torch.uint8, - "Input tensor must be uint8") + invalidInputError(tensor.dtype == torch.bfloat16, + "Input tensor must be bfloat16") invalidInputError(tensor.device == torch.device('cpu'), - "Input tensor must be uint8") + "Input tensor must be on cpu") src = ctypes.c_void_p(tensor.data.data_ptr()) @@ -391,8 +383,7 @@ def __new__(cls, imatrix=None, in_features=None, enable_xetla=False, - enable_scale_search=False, - enable_deepspeed_zero3=False): + enable_scale_search=False): if data is None: data = torch.empty(0) @@ -406,7 +397,6 @@ def __new__(cls, self.in_features = in_features self.enable_xetla = enable_xetla self.enable_scale_search = enable_scale_search - self.enable_deepspeed_zero3 = enable_deepspeed_zero3 return self def ggml_mse(self, w, ggml_qtype, device): @@ -465,8 +455,7 @@ def quantize(self, device=None): convert_shape_only=self.convert_shape_only, imatrix=self.imatrix, in_features=self.in_features, - enable_scale_search=self.enable_scale_search, - enable_deepspeed_zero3=self.enable_deepspeed_zero3) + enable_scale_search=self.enable_scale_search) self.data = w_quantized self.quantized = True self._shape = w.shape @@ -594,7 +583,7 @@ class MatMulLowBit(torch.autograd.Function): 
def forward(ctx, A, weight, input_seq_size): ctx.is_empty = False import xe_linear - if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + if weight.qtype == NF4: result = xe_linear.forward_new(A, weight.data.view(torch.uint8), weight.qtype, @@ -620,7 +609,7 @@ def backward(ctx, grad_output): if req_gradA: if torch.xpu.is_autocast_xpu_enabled(): grad_output = grad_output.to(torch.xpu.get_autocast_xpu_dtype()) - if hasattr(weight, "enable_deepspeed_zero3") and weight.enable_deepspeed_zero3: + if weight.qtype == NF4: dequant_weight = xe_linear.dequant(A, weight.data.view(torch.uint8), weight.qtype) @@ -664,15 +653,13 @@ class LowBitLinear(nn.Linear): def __init__(self, input_features, output_features, qtype, bias=True, conver_to_half=True, mp_group=None, enable_xetla=False, optimize_lm_head=False, act_order=False, - enable_scale_search=False, - enable_deepspeed_zero3=False): + enable_scale_search=False): super().__init__(input_features, output_features, bias) self.weight = FP4Params(self.weight.data, requires_grad=False, quantized=False, _shape=None, qtype=qtype, enable_xetla=enable_xetla, - enable_scale_search=enable_scale_search, - enable_deepspeed_zero3=enable_deepspeed_zero3) + enable_scale_search=enable_scale_search) self.in_len = input_features self.out_len = output_features self.weight_shape = (self.out_len, self.in_len) @@ -692,7 +679,6 @@ def __init__(self, input_features, output_features, qtype, bias=True, self.is_lm_head = self.in_len * self.out_len >= 32000 * 4096 and self.bias is None self.low_memory_mode = self.is_lm_head self.act_order = act_order - self.enable_deepspeed_zero3 = enable_deepspeed_zero3 if act_order: self.register_buffer( "g_idx_map", @@ -763,8 +749,7 @@ def forward(self, x: torch.Tensor): if x_2d.requires_grad: result = MatMulLowBit.apply(x_2d, self.weight, input_seq_size) else: - if hasattr(self.weight, "enable_deepspeed_zero3") \ - and self.weight.enable_deepspeed_zero3: + if self.weight.qtype == NF4: result = xe_linear.forward_new(x_2d, self.weight.data.view(torch.uint8), self.weight.qtype, From 95c252bef9561f3bdd73e1b3da2462136f16f3e8 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:51:55 +0800 Subject: [PATCH 25/29] Update model.py --- python/llm/src/ipex_llm/transformers/model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/model.py b/python/llm/src/ipex_llm/transformers/model.py index 3bb0cd1de89..d9dd6354970 100644 --- a/python/llm/src/ipex_llm/transformers/model.py +++ b/python/llm/src/ipex_llm/transformers/model.py @@ -454,7 +454,6 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): if embedding_qtype is not None: embedding_qtype = ggml_tensor_qtype[embedding_qtype] enable_xetla = kwargs.pop("enable_xetla", False) - enable_deepspeed_zero3 = kwargs.pop("enable_deepspeed_zero3", False) _args = copy.deepcopy(args) _kwargs = copy.deepcopy(kwargs) awq_config = None @@ -525,8 +524,7 @@ def load_convert(cls, q_k, optimize_model, *args, **kwargs): imatrix_data=imatrix_data, embedding_qtype=embedding_qtype, enable_xetla=enable_xetla, - mixed_precision=mixed_precision, - enable_deepspeed_zero3=enable_deepspeed_zero3) + mixed_precision=mixed_precision) if disk_embedding: from ipex_llm.transformers.embedding import DiskEmbedding From 369c3693741441b454c3e0e8592bd60853707404 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Mon, 12 Aug 2024 16:31:18 +0800 
Subject: [PATCH 26/29] Update alpaca_qlora_finetuning.py --- .../QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py index 691f905bb86..c1df15db3be 100644 --- a/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py +++ b/python/llm/example/GPU/LLM-Finetuning/QLoRA/alpaca-qlora/alpaca_qlora_finetuning.py @@ -144,7 +144,7 @@ def train( prompter = Prompter(prompt_template_name) - if "zero3" in deepspeed: + if deepspeed is not None and "zero3" in deepspeed: from ipex_llm.transformers.utils \ import _constant_buffered_norm2 from ipex_llm.llm_patching import replace_attr @@ -194,7 +194,7 @@ def train( # # device_map=device_map, # modules_to_not_convert=["lm_head"], # ) - if not "zero3" in deepspeed: + if deepspeed is not None and not "zero3" in deepspeed: print(f"Model loaded on rank {os.environ.get('LOCAL_RANK')}") model = model.to(f'xpu:{os.environ.get("LOCAL_RANK", 0)}') print(f"Model moved to rank {os.environ.get('LOCAL_RANK')}") From ce99bca41043bb9d29dcc2a711071312921b5eff Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:15:09 +0800 Subject: [PATCH 27/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index f91c1c23468..52959aa1e04 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -263,8 +263,12 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, def ggml_q_format_convet_cpu2xpu(tensor: torch.Tensor, num_elem: int, qtype: int): - invalidInputError(tensor.dtype == torch.bfloat16, - "Input tensor must be bfloat16") + if qtype == NF4: + invalidInputError(tensor.dtype == torch.bfloat16, + "NF4 Input tensor must be bfloat16") + else: + invalidInputError(tensor.dtype == torch.uint8, + "Input tensor except NF4 must be uint8") invalidInputError(tensor.device == torch.device('cpu'), "Input tensor must be on cpu") From 17dbf80217bcb777dc32659bd8d6ce90c0b703a4 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:38:58 +0800 Subject: [PATCH 28/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index 52959aa1e04..fb9fc080fb7 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -228,12 +228,15 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, dst_tensor = dst_tensor.reshape(tensor.shape[0], tensor.shape[-1] // QK) scale = torch.empty(n // k, dtype=torch.float32, device=device) - else: + elif qtype == NF4: # Deepspeed zero3 requires unified dtype, # thus here uses bfloat16 consistent to other layers # dst_size above is computed based on uint8, and for bfloat16, # buffer size should be half dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, + device=device) + else: + dst_tensor = torch.empty(dst_size, 
dtype=torch.uint8, device=device) if not convert_shape_only and device != 'meta': From 1c8cd6c8e9aedf64ec8b6e1667681a1151f15d25 Mon Sep 17 00:00:00 2001 From: Heyang Sun <60865256+Uxito-Ada@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:40:34 +0800 Subject: [PATCH 29/29] Update low_bit_linear.py --- python/llm/src/ipex_llm/transformers/low_bit_linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/llm/src/ipex_llm/transformers/low_bit_linear.py b/python/llm/src/ipex_llm/transformers/low_bit_linear.py index fb9fc080fb7..5536bf2cb34 100644 --- a/python/llm/src/ipex_llm/transformers/low_bit_linear.py +++ b/python/llm/src/ipex_llm/transformers/low_bit_linear.py @@ -234,7 +234,7 @@ def ggml_convert_qtype(tensor: torch.Tensor, qtype: int, # dst_size above is computed based on uint8, and for bfloat16, # buffer size should be half dst_tensor = torch.empty(dst_size // 2, dtype=torch.bfloat16, - device=device) + device=device) else: dst_tensor = torch.empty(dst_size, dtype=torch.uint8, device=device)