diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/training/utils/utils.py
index b5cfb8d6d..a9a84b618 100644
--- a/applications/DeepSpeed-Chat/training/utils/utils.py
+++ b/applications/DeepSpeed-Chat/training/utils/utils.py
@@ -174,15 +174,18 @@ def get_optimizer_grouped_parameters(
     model,
     weight_decay,
     lora_lr=5e-4,
-    no_decay_name_list=["bias", "LayerNorm.weight"],
+    no_decay_name_list=[
+        "bias", "layer_norm.weight", "layernorm.weight", "norm.weight",
+        "ln_f.weight"
+    ],
     lora_name_list=["lora_right_weight", "lora_left_weight"],
 ):
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (not any(nd in n for nd in no_decay_name_list)
-                    and p.requires_grad and not any(nd in n
+                if (not any(nd in n.lower() for nd in no_decay_name_list)
+                    and p.requires_grad and not any(nd in n.lower()
                                                     for nd in lora_name_list))
             ],
             "weight_decay":
@@ -191,8 +194,8 @@ def get_optimizer_grouped_parameters(
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (not any(nd in n for nd in no_decay_name_list)
-                    and p.requires_grad and any(nd in n
+                if (not any(nd in n.lower() for nd in no_decay_name_list)
+                    and p.requires_grad and any(nd in n.lower()
                                                 for nd in lora_name_list))
             ],
             "weight_decay":
@@ -203,7 +206,7 @@ def get_optimizer_grouped_parameters(
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (any(nd in n
+                if (any(nd in n.lower()
                         for nd in no_decay_name_list) and p.requires_grad)
             ],
             "weight_decay":
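
For reference, a minimal sketch of what the lowercased, expanded match changes in practice. The parameter names below are hypothetical LLaMA- and GPT-2-style names chosen for illustration; they do not appear in the diff:

# Sketch of the matching behavior before and after the change.
# The parameter names below are hypothetical examples, not taken
# from the diff above.

old_list = ["bias", "LayerNorm.weight"]
new_list = [
    "bias", "layer_norm.weight", "layernorm.weight", "norm.weight",
    "ln_f.weight"
]

names = [
    "model.layers.0.input_layernorm.weight",   # LLaMA-style norm weight
    "model.layers.0.self_attn.q_proj.weight",  # ordinary projection weight
    "transformer.ln_f.weight",                 # GPT-2-style final norm
]

for n in names:
    old_match = any(nd in n for nd in old_list)          # old: case-sensitive
    new_match = any(nd in n.lower() for nd in new_list)  # new: lowercased
    print(f"{n}: old no-decay={old_match}, new no-decay={new_match}")

# Expected output:
# model.layers.0.input_layernorm.weight: old no-decay=False, new no-decay=True
# model.layers.0.self_attn.q_proj.weight: old no-decay=False, new no-decay=False
# transformer.ln_f.weight: old no-decay=False, new no-decay=True

Under the old case-sensitive check, any norm weight whose name did not contain the exact string "LayerNorm.weight" fell through into the weight-decay group; the lowercased check against the expanded list routes such parameters into the zero-decay group instead.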