diff --git a/applications/DeepSpeed-Chat/training/utils/utils.py b/applications/DeepSpeed-Chat/training/utils/utils.py
index b5cfb8d6d..a9a84b618 100644
--- a/applications/DeepSpeed-Chat/training/utils/utils.py
+++ b/applications/DeepSpeed-Chat/training/utils/utils.py
@@ -174,15 +174,18 @@ def get_optimizer_grouped_parameters(
     model,
     weight_decay,
     lora_lr=5e-4,
-    no_decay_name_list=["bias", "LayerNorm.weight"],
+    no_decay_name_list=[
+        "bias", "layer_norm.weight", "layernorm.weight", "norm.weight",
+        "ln_f.weight"
+    ],
     lora_name_list=["lora_right_weight", "lora_left_weight"],
 ):
     optimizer_grouped_parameters = [
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (not any(nd in n for nd in no_decay_name_list)
-                    and p.requires_grad and not any(nd in n
+                if (not any(nd in n.lower() for nd in no_decay_name_list)
+                    and p.requires_grad and not any(nd in n.lower()
                                                     for nd in lora_name_list))
             ],
             "weight_decay":
@@ -191,8 +194,8 @@ def get_optimizer_grouped_parameters(
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (not any(nd in n for nd in no_decay_name_list)
-                    and p.requires_grad and any(nd in n
+                if (not any(nd in n.lower() for nd in no_decay_name_list)
+                    and p.requires_grad and any(nd in n.lower()
                                                 for nd in lora_name_list))
             ],
             "weight_decay":
@@ -203,7 +206,7 @@ def get_optimizer_grouped_parameters(
         {
             "params": [
                 p for n, p in model.named_parameters()
-                if (any(nd in n
+                if (any(nd in n.lower()
                         for nd in no_decay_name_list) and p.requires_grad)
             ],
             "weight_decay":
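
For reference, a minimal sketch of what the lowercased, expanded match changes in practice. The parameter names below are hypothetical LLaMA- and GPT-2-style names chosen for illustration; they do not appear in the diff:

# Sketch of the matching behavior before and after the change.
# The parameter names below are hypothetical examples, not taken
# from the diff above.

old_list = ["bias", "LayerNorm.weight"]
new_list = [
    "bias", "layer_norm.weight", "layernorm.weight", "norm.weight",
    "ln_f.weight"
]

names = [
    "model.layers.0.input_layernorm.weight",   # LLaMA-style norm weight
    "model.layers.0.self_attn.q_proj.weight",  # ordinary projection weight
    "transformer.ln_f.weight",                 # GPT-2-style final norm
]

for n in names:
    old_match = any(nd in n for nd in old_list)          # old: case-sensitive
    new_match = any(nd in n.lower() for nd in new_list)  # new: lowercased
    print(f"{n}: old no-decay={old_match}, new no-decay={new_match}")

# Expected output:
# model.layers.0.input_layernorm.weight: old no-decay=False, new no-decay=True
# model.layers.0.self_attn.q_proj.weight: old no-decay=False, new no-decay=False
# transformer.ln_f.weight: old no-decay=False, new no-decay=True

Under the old case-sensitive check, any norm weight whose name did not contain the exact string "LayerNorm.weight" fell through into the weight-decay group; the lowercased check against the expanded list routes such parameters into the zero-decay group instead.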