intel-analytics · xiangyuT · Jul 5, 2024 · Jun 27, 2024 · Jun 28, 2024 · Jun 28, 2024
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
@@ -306,18 +306,21 @@ async def main():
                         help='The port number on which the server will run.')
     parser.add_argument('--max-num-seqs', type=int, default=8,
                         help='Max num sequences in a batch.')
+    parser.add_argument('--max-prefilled-seqs', type=int, default=0,
+                        help='Max num sequences in a batch during prefilling.')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     low_bit = args.low_bit
     max_num_seqs = args.max_num_seqs
+    max_prefilled_seqs = args.max_prefilled_seqs
 
     # serialize model initialization so that we do not run out of CPU memory
     for i in range(my_size):
         if my_rank == i:
             logger.info("start model initialization")
             global local_model
-            local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs)
+            local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
             logger.info("model initialized")
         dist.barrier()
     # Load tokenizer

diff --git a/python/llm/src/ipex_llm/transformers/models/chatglm2.py b/python/llm/src/ipex_llm/transformers/models/chatglm2.py
@@ -80,6 +80,7 @@ def chatglm2_model_forward(
     else:
         inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
         seq_length, batch_size, _ = inputs_embeds.shape
+        input_ids = torch.empty((batch_size, seq_length), device=inputs_embeds.device)
 
     if full_attention_mask is None:
         if (attention_mask is not None and not attention_mask.all()) or (