[NPU] Support Baichuan groupwise & gw code refactor #12337

Merged · 23 commits · Nov 8, 2024
Changes from 1 commit
remove code
cyita committed Nov 7, 2024
commit 4a5cdd6d7bdc1d17ab47b84c0f2960eb24bc9543
python/llm/src/ipex_llm/transformers/npu_models/baichuan_mp.py — 11 changes: 1 addition & 10 deletions

@@ -199,20 +199,11 @@ def __init__(
             new_key_states = self.convert_to_fp16(curr_key_values[i][0])
             new_value_states = self.convert_to_fp16(curr_key_values[i][1])
 
-        # print("start compiling")
-        # self.compile()
-        print(f"{mode} start compiling - {num_layers}-{n_splits_linear}-{n_splits_down_proj}")
-        t1 = time.perf_counter()
+        print("start compiling")
         if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1":
             self.compile(npu_dpu_groups=6)
         else:
             self.compile()
-        t2 = time.perf_counter()
-        print(f"{mode} end compiling - {num_layers}-{n_splits_linear}-{n_splits_down_proj}, time: {t2 - t1}s")
-        xml_path = f"gw/baichuan2-7b-npu-{mode}-{num_layers}-{transpose_value}-{n_splits_linear}-{n_splits_down_proj}.xml"
-
-        if not os.path.exists(xml_path):
-            self.save(xml_path)
 
 
     def attention(self,
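
For readers skimming the diff: the retained logic gates the NPU compile optimization on an environment variable, so prefill graphs compile with `npu_dpu_groups=6` unless `IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1` is set. Below is a minimal, self-contained sketch of that control flow; the class and its `compile` stub are placeholders, not the actual graph wrapper in `baichuan_mp.py`.

```python
import os

class GraphBuilderSketch:
    """Illustrative stand-in for the NPU graph wrapper; not the real class."""

    def compile(self, npu_dpu_groups=None):
        # Placeholder: the real implementation lowers the graph for the NPU.
        print(f"compiling (npu_dpu_groups={npu_dpu_groups})")

    def build(self, mode: str):
        # Prefill graphs opt into the DPU-group optimization unless the
        # IPEX_LLM_NPU_DISABLE_COMPILE_OPT=1 escape hatch is set.
        if mode == "prefill" and os.environ.get("IPEX_LLM_NPU_DISABLE_COMPILE_OPT", "0") != "1":
            self.compile(npu_dpu_groups=6)
        else:
            self.compile()

GraphBuilderSketch().build("prefill")  # compiling (npu_dpu_groups=6)
GraphBuilderSketch().build("decode")   # compiling (npu_dpu_groups=None)
```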
python/llm/src/ipex_llm/transformers/npu_models/convert.py — 2 changes: 0 additions & 2 deletions

@@ -71,8 +71,6 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
     iqtype = ggml_tensor_qtype[qtype]
     qweights, scale = ggml_convert_qtype(layer.weight.data.to(torch.float32),
                                          iqtype, device=device)
-    # if layer.out_features == 125696:
-    #     group_size = 0
     return QuantizedLinear(qweights, scale, layer.bias,
                            group_size=group_size)
 
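The `group_size` argument passed to `QuantizedLinear` is what makes the quantization groupwise: with a non-zero group size, each contiguous block of input channels gets its own scale instead of sharing one per output row. The sketch below illustrates that idea with generic symmetric int4 scaling; it is not the `ggml_convert_qtype` kernel the PR actually calls.

```python
import torch

def groupwise_int4_scales(weight: torch.Tensor, group_size: int):
    """Illustrative groupwise symmetric int4 quantization of a (out_features, in_features) weight."""
    out_features, in_features = weight.shape
    assert in_features % group_size == 0, "in_features must be divisible by group_size"
    groups = weight.reshape(out_features, in_features // group_size, group_size)
    # One scale per (output row, input group): map the group's max magnitude onto 7.
    scales = (groups.abs().amax(dim=-1) / 7.0).clamp(min=1e-8)
    q = torch.round(groups / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
    return q.reshape(out_features, in_features), scales

q, scales = groupwise_int4_scales(torch.randn(8, 256), group_size=64)
print(q.shape, scales.shape)  # torch.Size([8, 256]) torch.Size([8, 4])
```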
@@ -87,7 +87,6 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         model.llm.config.model_type = "llama"
         model = model.llm
 
-    print(model)
     if model.config.model_type in ["qwen2", "llama", "minicpm", "baichuan"]:
         from ipex_llm.transformers.npu_models.common import split_linears
         if quantization_group_size == 0:
@@ -107,7 +106,7 @@ def optimize_llm_pre(model: torch.nn.Module, qtype, mixed_precision,
         model.apply(lambda m: split_linears(m, n_splits_hidden_size=n_splits_linear,
                                             n_splits_down_proj=n_splits_down_proj,
                                             load=load))
 
         if quantization_group_size != 0:
             split_num = model.config.hidden_size // quantization_group_size
             if model.config.model_type == "minicpm" and model.config.num_hidden_layers == 40:
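
The second hunk keeps the rule that, for groupwise quantization, linear layers are split so each slice spans whole quantization groups along the hidden dimension (`split_num = hidden_size // quantization_group_size`). A hedged sketch of that splitting step is shown below; `split_linear_by_group` is a hypothetical helper for illustration, not the repo's `split_linears`.

```python
import torch
import torch.nn as nn

def split_linear_by_group(layer: nn.Linear, group_size: int):
    """Split a bias-free Linear along its input dimension into in_features // group_size slices."""
    split_num = layer.in_features // group_size
    parts = []
    for i in range(split_num):
        part = nn.Linear(group_size, layer.out_features, bias=False)
        part.weight.data = layer.weight.data[:, i * group_size:(i + 1) * group_size].clone()
        parts.append(part)
    return parts

layer = nn.Linear(1024, 1024, bias=False)
parts = split_linear_by_group(layer, group_size=256)

# Summing the slice outputs over matching input chunks reproduces the original layer.
x = torch.randn(2, 1024)
y = sum(p(c) for p, c in zip(parts, x.split(256, dim=-1)))
print(len(parts), torch.allclose(y, layer(x), atol=1e-5))  # 4 True
```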
python/llm/src/ipex_llm/transformers/npu_models/linear.py — 1 change: 0 additions & 1 deletion

@@ -158,7 +158,6 @@ def __init__(
         if group_size != 0:
             self.scale = Parameter(scale, requires_grad=False)
         else:
-            # print("scale_factor True")
             if self.weight.dtype == torch.uint8:
                 # Int4 we need to double the input channels because weights are compressed
                 self.inC *= 2
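
The unchanged comment above explains why `inC` is doubled for `torch.uint8` weights: two 4-bit values are packed into each stored byte, so the packed tensor has half as many input channels as the logical weight. A small unpacking sketch under that assumption follows; the nibble order is chosen arbitrarily for illustration and may not match the actual ggml/NPU layout.

```python
import torch

def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
    """Unpack a uint8 tensor holding two signed 4-bit weights per byte (low nibble first)."""
    low = (packed & 0x0F).to(torch.int8)
    high = (packed >> 4).to(torch.int8)
    # Map the unsigned nibbles [0, 15] back to signed int4 values in [-8, 7].
    low = torch.where(low > 7, low - 16, low)
    high = torch.where(high > 7, high - 16, high)
    return torch.stack((low, high), dim=-1).reshape(*packed.shape[:-1], packed.shape[-1] * 2)

packed = torch.randint(0, 256, (4, 8), dtype=torch.uint8)
print(unpack_int4(packed).shape)  # torch.Size([4, 16]) -- input channels doubled
```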
@@ -374,7 +374,6 @@ def convert_llm(model: torch.nn.Module,
         model.lm_head_1.get_fused_lm_head()
         model.lm_head_0.get_fused_lm_head()
 
-    print(model)
     # patch generate function
     import types
     model.generate = types.MethodType(generate, model)
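
The context around the removed `print(model)` shows the pattern used to patch generation: `types.MethodType` rebinds a function as a bound method on the model instance, so only that object is affected rather than its class. A minimal standalone sketch of the pattern (toy class and function, not the ipex-llm `generate`):

```python
import types

class ToyModel:
    def generate(self, prompt):
        return f"original: {prompt}"

def patched_generate(self, prompt):
    # Custom pre/post-processing would go here before delegating to the NPU runner.
    return f"patched for NPU: {prompt}"

model = ToyModel()
model.generate = types.MethodType(patched_generate, model)  # patch this instance only
print(model.generate("hello"))    # patched for NPU: hello
print(ToyModel().generate("hi"))  # original: hi
```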