From 7b9ff3d2c091664ec71d1c456913efc19725b2d3 Mon Sep 17 00:00:00 2001
From: gc-fu <guancheng.fu@intel.com>
Date: Thu, 8 Aug 2024 13:50:50 +0800
Subject: [PATCH] Fix chatglm multi-reference problem

---
 python/llm/src/ipex_llm/transformers/convert.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py
index 0b3babd9c42..65d394b5ab9 100644
--- a/python/llm/src/ipex_llm/transformers/convert.py
+++ b/python/llm/src/ipex_llm/transformers/convert.py
@@ -331,6 +331,11 @@ def _replace_with_low_bit_linear(model, qtype, modules_to_not_convert=None,
         if any(key in full_module_name for key in modules_to_not_convert):
             continue
 
+        if is_linear and getattr(model_config, "model_type", None) == "chatglm" and name == "lm_head":
+            # Now we re-reference it to output_layer
+            model._modules[name] = model._modules["transformer"]._modules["output_layer"]
+            continue
+
         if is_linear and not isinstance(module, LowBitLinear):
             in_features, out_features, mp_group = linear_args
             optimize_lm_head = False