diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md
index b01e9282758..a18816ab72f 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md
@@ -32,8 +32,12 @@ pip install transformers==4.37.0
 
 bash run.sh
 ```
 
-> Note: INT4 optimization is applied to the model by default. You could specify other low bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`. Besides, you could change `NUM_GPUS` to the number of GPUs you have on your machine.
+### Command Line Arguments in `run.sh`
+> Note: INT4 optimization is applied to the model by default. You can specify other low-bit optimizations (such as 'fp8' and 'fp6') through `--low-bit`. Besides, you can change `NUM_GPUS` to the number of GPUs you have on your machine. Other related settings are listed below:
+- `--low-bit`: Sets the low-bit optimization (such as 'sym_int4', 'fp16', 'fp8' and 'fp6') applied to the model.
+- `--max-num-seqs`: Sets the maximum batch size on a single card during pipeline-parallel serving.
+- `--max-prefilled-seqs`: Sets the maximum batch size for prefilled sequences. Use `0` to disable partial prefilling and process all requests in a single batch.
 
 ### 3. Sample Input and Output
 
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
index d917891c937..0567280d26c 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py
@@ -306,18 +306,21 @@ async def main():
                         help='The port number on which the server will run.')
     parser.add_argument('--max-num-seqs', type=int, default=8,
                         help='Max num sequences in a batch.')
+    parser.add_argument('--max-prefilled-seqs', type=int, default=0,
+                        help='Max num sequences in a batch during prefilling.')
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
     low_bit = args.low_bit
     max_num_seqs = args.max_num_seqs
+    max_prefilled_seqs = args.max_prefilled_seqs
 
     # serialize model initialization so that we do not run out of CPU memory
     for i in range(my_size):
         if my_rank == i:
             logger.info("start model initialization")
             global local_model
-            local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs)
+            local_model = ModelRunner(model_path, my_rank, my_size, low_bit, max_num_seqs, max_prefilled_seqs)
             logger.info("model initialized")
         dist.barrier()
     # Load tokenizer
diff --git a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
index d1bbacf4acd..91bf6161bff 100644
--- a/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
+++ b/python/llm/example/GPU/Pipeline-Parallel-FastAPI/run.sh
@@ -24,11 +24,13 @@ source $basekit_root/setvars.sh --force
 source $basekit_root/ccl/latest/env/vars.sh --force
 
 export USE_XETLA=OFF
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
+if [[ $KERNEL_VERSION != *"6.5"* ]]; then
+    export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+fi
 export TORCH_LLM_ALLREDUCE=0
 
 export MODEL_PATH=YOUR_MODEL_PATH
 export NUM_GPUS=2
 export IPEX_LLM_QUANTIZE_KV_CACHE=1
 
-CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8 --max-num-seqs 4
+CCL_ZE_IPC_EXCHANGE=sockets torchrun --standalone --nnodes=1 --nproc-per-node $NUM_GPUS pipeline_serving.py --repo-id-or-model-path $MODEL_PATH --low-bit fp8 --max-num-seqs 4 --max-prefilled-seqs 0
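Reviewer note (illustration only, not part of the patch): the new `--max-prefilled-seqs` flag caps how many sequences are prefilled per forward pass; `ModelRunner.prepare_batch()` in the `pipeline_parallel.py` changes below advances `prefilled_index` by at most that amount until the whole batch is covered. A minimal sketch of that index arithmetic, using a hypothetical `prefill_chunks` helper:

```python
# Illustration only -- a hypothetical helper mirroring the index arithmetic
# that ModelRunner.prepare_batch() uses for partial prefilling.
def prefill_chunks(batch_size: int, max_prefilled_seqs: int):
    """Yield the (start, end) ranges the prefill phase would process."""
    if max_prefilled_seqs <= 0:
        # 0 disables partial prefilling: the whole batch is prefilled at once.
        yield 0, batch_size
        return
    start = 0
    while start < batch_size:
        end = min(start + max_prefilled_seqs, batch_size)
        yield start, end
        start = end

# e.g. --max-num-seqs 4 --max-prefilled-seqs 2 -> [(0, 2), (2, 4)]
print(list(prefill_chunks(4, 2)))
```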
diff --git a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
index 5ecfa903c39..72908d61876 100644
--- a/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
+++ b/python/llm/src/ipex_llm/transformers/pipeline_parallel.py
@@ -152,6 +152,9 @@ def pipeline_parallel(model, pipeline_parallel_stages):
         model._modules['lm_head'] = DummyLayer()
 
     model.pipeline_parallel_stages = pipeline_parallel_stages
+    model.layer_start = layer_start
+    model.layer_end = layer_end
+    model.num_layers = num_layers
     model = model.to(f'xpu:{local_rank}')
     return model
 
@@ -349,6 +352,9 @@ class BatchTask(BaseModel):
     prompt_lengths: List[int]
     stopped: bool
 
+    prefilled_index: int
+    partial_prefilling: int
+
 
 def make_attention_mask(prompt_lengths):
     max_length = max(prompt_lengths)
@@ -360,7 +366,7 @@ def make_attention_mask(prompt_lengths):
 
 class ModelRunner:
     """Implementation for pipeline parallel multi-stage serving."""
-    def __init__(self, checkpoint, rank, world_size, low_bit, max_num_seqs,
+    def __init__(self, checkpoint, rank, world_size, low_bit, max_num_seqs, max_prefilled_seqs,
                  torch_dtype=torch.float16):
         self.pp_config = PPConfig(rank, world_size)
         self.dtype = torch_dtype
@@ -389,7 +395,11 @@ def __init__(self, checkpoint, rank, world_size, low_bit, max_num_seqs,
         self.print_len = {}
         self.is_finish = {}
         self.model_name = checkpoint
-        self.layer_start = 0
+        # self.layer_start = 0
+        # self.layer_end = 0
+
+        self.max_prefilled_seqs = max_prefilled_seqs
+        self.partial_output_dict = {}
 
     def load_model(self, model_path, world_size, low_bit='sym_int4'):
         from ipex_llm.transformers import AutoModelForCausalLM, AutoModel
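Reviewer note (illustration only, not part of the patch): the first hunk above records `layer_start`, `layer_end` and `num_layers` on the model so each pipeline stage later knows which decoder layers it owns when padding KV caches (`update_kv_cache` in the next hunk). This diff does not show how those bounds are computed; the sketch below assumes a simple even split across `pipeline_parallel_stages`, with a hypothetical `stage_layer_bounds` helper:

```python
# Illustration only -- the split is an assumption, not shown in this diff.
def stage_layer_bounds(num_layers: int, stages: int, rank: int):
    per_stage = num_layers // stages
    layer_start = per_stage * rank
    # Assume the last stage picks up any remainder layers.
    layer_end = num_layers if rank == stages - 1 else per_stage * (rank + 1)
    return layer_start, layer_end

# e.g. 40 decoder layers over 2 GPUs -> rank 0 owns [0, 20), rank 1 owns [20, 40)
print(stage_layer_bounds(40, 2, 0), stage_layer_bounds(40, 2, 1))
```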
@@ -412,11 +422,90 @@ def load_model(self, model_path, world_size, low_bit='sym_int4'):
         model = model.eval()
         return model
 
+    def prepare_batch(self, cur_batch):
+        if self.rank == 0:
+            cur_input_start = cur_batch.prefilled_index
+            if self.max_prefilled_seqs > 0:
+                if cur_input_start < cur_batch.batch_size:
+                    cur_input_end = cur_input_start + self.max_prefilled_seqs
+                    cur_input_end = min(cur_input_end, cur_batch.batch_size)
+                    cur_batch.partial_prefilling = cur_input_end - cur_input_start
+                else:
+                    cur_batch.partial_prefilling = 0
+
+        return cur_batch
+
+    def cat_kv_cache(self, model_type, kv_cache_1, kv_cache_2):
+        if model_type in ["baichuan", "chatglm"]:
+            result = []
+            for sub_tuple1, sub_tuple2 in zip(kv_cache_1, kv_cache_2):
+                if sub_tuple1 is None:
+                    sub_result = [sub_tuple2]
+                elif sub_tuple2 is None:
+                    sub_result = [sub_tuple1]
+                else:
+                    sub_result = []
+                    for t1, t2 in zip(sub_tuple1, sub_tuple2):
+                        if t1 is None:
+                            sub_result.append(t2)
+                        elif t2 is None:
+                            sub_result.append(t1)
+                        else:
+                            if model_type == "chatglm" and self.model.config.num_layers != 40:
+                                sub_result.append(torch.cat((t1, t2), dim=1))
+                            else:
+                                sub_result.append(torch.cat((t1, t2), dim=0))
+                result.append(tuple(sub_result))
+            return tuple(result)
+        else:
+            # num_layers = self.model.layer_end - self.model.layer_start
+            for layer_idx in range(self.model.num_layers):
+                kv_cache_1.key_cache[layer_idx] = \
+                    torch.cat([kv_cache_1.key_cache[layer_idx],
+                               kv_cache_2.key_cache[layer_idx]], dim=0)
+                kv_cache_1.value_cache[layer_idx] = \
+                    torch.cat([kv_cache_1.value_cache[layer_idx],
+                               kv_cache_2.value_cache[layer_idx]], dim=0)
+
+            return kv_cache_1
+
+    def update_kv_cache(self, kv_cache, cur_id):
+        layer_start = self.model.layer_start
+        layer_end = self.model.layer_end
+        num_layers = self.model.num_layers
+
+        if self.model.config.model_type == "chatglm" and self.model.config.num_layers == 40:
+            # for glm-4-9b-chat
+            if self.past_key_values_dict.get(cur_id, None) is None:
+                value_placeholder = torch.empty_like((kv_cache)[-1][0])
+                past_key_values_placeholder = tuple(
+                    (value_placeholder, value_placeholder) for _ in range(layer_start)
+                ) + (kv_cache)[:layer_end - layer_start] + tuple(
+                    (value_placeholder, value_placeholder) for _ in range(layer_end, num_layers)
+                )
+                kv_cache = past_key_values_placeholder
+            else:
+                pass
+        elif self.model.config.model_type in ["baichuan", "chatglm"] and self.rank > 0:
+            value_placeholder = torch.empty_like((kv_cache)[-1][0])
+            kv_cache = tuple((value_placeholder, value_placeholder)) + \
+                tuple(None for _ in range(layer_start)) + \
+                (kv_cache)[layer_start:]
+            # past_key_values_placeholder = tuple(
+            #     (value_placeholder, value_placeholder) for _ in range(layer_start)
+            # ) + (kv_cache)[layer_start:]
+            # kv_cache = past_key_values_placeholder
+        else:
+            pass
+
+        return kv_cache
+
     @torch.no_grad()
     def model_step(self, input, cur_batch):
         if cur_batch is None or cur_batch.stopped or input is None:
-            return None
+            return None, cur_batch
 
+        # logger.info(f"{self.rank} {cur_batch} {input.shape}")
         cur_id = cur_batch.batch_id
         _past_key_values = self.past_key_values_dict.get(cur_id, None)
         attention_mask = make_attention_mask(cur_batch.prompt_lengths).to(input.device)
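Reviewer note (illustration only, not part of the patch): for models using transformers' `DynamicCache` (the non-baichuan/chatglm branch of `cat_kv_cache` above), merging the caches of two prefill chunks is a per-layer concatenation along the batch dimension. A runnable toy check of that idea, assuming transformers 4.37 as pinned in the README; `cat_dynamic_caches` is a hypothetical name:

```python
# Illustration only -- a toy check of the batch-dim merge done by cat_kv_cache
# for DynamicCache-based models. Shapes are (batch, num_heads, seq_len, head_dim).
import torch
from transformers.cache_utils import DynamicCache

def cat_dynamic_caches(cache_a, cache_b, num_layers):
    for layer_idx in range(num_layers):
        cache_a.key_cache[layer_idx] = torch.cat(
            [cache_a.key_cache[layer_idx], cache_b.key_cache[layer_idx]], dim=0)
        cache_a.value_cache[layer_idx] = torch.cat(
            [cache_a.value_cache[layer_idx], cache_b.value_cache[layer_idx]], dim=0)
    return cache_a

cache_a, cache_b = DynamicCache(), DynamicCache()
for layer in range(2):
    cache_a.update(torch.zeros(2, 4, 16, 8), torch.zeros(2, 4, 16, 8), layer)
    cache_b.update(torch.zeros(3, 4, 16, 8), torch.zeros(3, 4, 16, 8), layer)

merged = cat_dynamic_caches(cache_a, cache_b, num_layers=2)
print(merged.key_cache[0].shape)  # torch.Size([5, 4, 16, 8])
```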
@@ -424,44 +513,71 @@ def model_step(self, input, cur_batch):
         if self.rank == 0:
             input_ids = input
             inputs_embeds = None
+
+            if cur_batch.partial_prefilling > 0:
+                cur_input_start = cur_batch.prefilled_index
+                cur_input_end = cur_input_start + cur_batch.partial_prefilling
+                input_ids = input_ids[cur_input_start:cur_input_end]
+                attention_mask = attention_mask[cur_input_start:cur_input_end]
+                tmp_past_key_values = _past_key_values
+                _past_key_values = None
         else:
             input_ids = None
             inputs_embeds = input
 
-        torch.xpu.empty_cache()
+            if cur_batch.partial_prefilling > 0:
+                cur_input_start = cur_batch.prefilled_index
+                cur_input_end = cur_input_start + cur_batch.partial_prefilling
+                attention_mask = attention_mask[cur_input_start:cur_input_end]
+                tmp_past_key_values = _past_key_values
+                _past_key_values = None
+
+        # torch.xpu.empty_cache()
         output = self.model(input_ids=input_ids,
                             inputs_embeds=inputs_embeds,
                             past_key_values=_past_key_values,
                             attention_mask=attention_mask,
                             use_cache=True,)
 
-        if self.model.config.model_type == "chatglm" and self.model.config.num_layers == 40:
-            # for glm-4-9b-chat
-            if self.past_key_values_dict.get(cur_id, None) is None:
-                value_placeholder = torch.empty_like((output.past_key_values)[-1][0])
-                past_key_values_placeholder = tuple(
-                    (value_placeholder, value_placeholder) for _ in range(layer_start)
-                ) + (output.past_key_values)[: layer_end - layer_start] + tuple(
-                    (value_placeholder, value_placeholder) for _ in range(layer_end, num_layers)
-                )
-                _past_key_values = past_key_values_placeholder
+        if cur_batch.partial_prefilling > 0:
+            cur_batch.prefilled_index = cur_input_end
+            if tmp_past_key_values is None:
+                tmp_past_key_values = output.past_key_values
             else:
-                _past_key_values = output.past_key_values
-        elif self.model.config.model_type in ["baichuan", "chatglm"] and self.rank > 0:
-            # for baichuan2 and chatglm3
-            value_placeholder = torch.empty_like((output.past_key_values)[-1][0])
-            past_key_values_placeholder = tuple(
-                (value_placeholder, value_placeholder) for _ in range(layer_start)
-            ) + (output.past_key_values)[layer_start:]
-            _past_key_values = past_key_values_placeholder
+                tmp_past_key_values = self.cat_kv_cache(self.model.config.model_type,
+                                                        tmp_past_key_values,
+                                                        output.past_key_values)
+            # torch.xpu.empty_cache()
+
+            if cur_batch.prefilled_index == cur_batch.batch_size:
+                tmp_past_key_values = self.update_kv_cache(tmp_past_key_values, cur_id)
+
+            self.past_key_values_dict[cur_id] = tmp_past_key_values
+
+            if self.pp_config.is_tail:
+                _pre_output = self.partial_output_dict.get(cur_id, None)
+                tmp_output = output.logits.to(self.dtype)
+                tmp_output = torch.argmax(tmp_output[:, -1:, :], dim=-1)
+                if _pre_output is None:
+                    _pre_output = tmp_output
+                else:
+                    _pre_output = torch.cat((_pre_output, tmp_output), dim=0)
+                self.partial_output_dict[cur_id] = _pre_output
         else:
-            _past_key_values = output.past_key_values
-        self.past_key_values_dict[cur_id] = _past_key_values
+            _past_key_values = self.update_kv_cache(output.past_key_values, cur_id)
+            self.past_key_values_dict[cur_id] = _past_key_values
         torch.xpu.synchronize()
         if not self.pp_config.is_tail:
-            return output[0].to(self.dtype)
+            return output[0].to(self.dtype), cur_batch
         else:
-            return output.logits
+            if cur_batch.partial_prefilling > 0 and \
+                    cur_batch.prefilled_index == cur_batch.batch_size:
+                _output = self.partial_output_dict.pop(cur_id, None)
+                cur_batch.partial_prefilling = 0
+                return _output, cur_batch
+            else:
+                _output = torch.argmax(output.logits[:, -1:, :], dim=-1)
+                return _output, cur_batch
 
     def is_initialized(self):
         return True
@@ -489,6 +605,8 @@ async def add_request(self, tokenizer):
             input_len=input_ids.size(1),
             prompt_lengths=[sum(attention_mask[i, :]) for i in range(input_ids.size(0))],
             stopped=False,
+            prefilled_index=0,
+            partial_prefilling=0,
         )
 
         self.input_ids_dict[new_batch.batch_id] = input_ids
@@ -502,11 +620,15 @@ def clear_batch(self, cur_id):
         self.token_times.pop(cur_id, None)
         self.past_key_values_dict.pop(cur_id, None)
 
+        self.is_finish.pop(cur_id, None)
+        self.partial_output_dict.pop(cur_id, None)
+
    async def process_step(self, tokenizer, result_dict):
        cur_batch = None
 
        if self.rank == 0:
            if self.send_buff is not None:
+                # logger.info(f"send {self.rank} {self.send_buff.shape}")
                dist.send(self.send_buff, dst=self.next_rank)
 
            if self.on_going_batches[0] is not None:
@@ -515,6 +637,7 @@
 
             if cur_batch is None:
                 if not self.waiting_requests.empty():
+                    # wait more requests to be put in self.waiting_requests
                     await asyncio.sleep(0.01)
                     cur_batch = await self.add_request(tokenizer)
                     cur_input = self.input_ids_dict[cur_batch.batch_id]
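Reviewer note (illustration only, not part of the patch): on the tail stage, `model_step` above accumulates one next-token id per prefilled sequence in `partial_output_dict` by taking the argmax of the last position's logits for each chunk and concatenating chunks along the batch dimension. A small standalone sketch of that accumulation (shapes and `vocab_size` are made up for the example):

```python
# Illustration only -- how chunked prefill produces one next token per sequence.
import torch

vocab_size = 32000
chunk_logits = [torch.randn(2, 7, vocab_size),   # logits for a chunk of 2 sequences
                torch.randn(2, 7, vocab_size)]   # logits for the remaining 2 sequences

partial = None
for logits in chunk_logits:
    next_ids = torch.argmax(logits[:, -1:, :], dim=-1)          # (chunk_size, 1)
    partial = next_ids if partial is None else torch.cat((partial, next_ids), dim=0)

print(partial.shape)  # torch.Size([4, 1]) -> one generated token id per sequence
```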
@@ -524,84 +647,99 @@
 
             if (cur_batch is not None) and (not cur_batch.stopped) and (cur_input is None):
                 cur_id = cur_batch.batch_id
-                next_ids = torch.empty((cur_batch.batch_size, 1,), device=f'xpu:{self.rank}',
-                                       dtype=torch.int64)
-                dist.recv(next_ids, src=self.pre_rank)
+                # cur_batch = self.prepare_batch(cur_batch)
+                if cur_batch.prefilled_index >= cur_batch.batch_size:
+                    cur_batch.partial_prefilling = 0
+                if cur_batch.partial_prefilling > 0:
+                    next_ids = torch.empty((cur_batch.partial_prefilling, 1,),
+                                           device=f'xpu:{self.rank}', dtype=torch.int64)
+                else:
+                    next_ids = torch.empty((cur_batch.batch_size, 1,),
+                                           device=f'xpu:{self.rank}', dtype=torch.int64)
 
-                if self.tokens.get(cur_id, None) is None:
-                    self.tokens[cur_id] = []
-
-                if len(next_ids.shape) == 1:
-                    next_ids = next_ids.unsqueeze(0)
-                self.tokens[cur_id].append(next_ids)
-                self.token_times[cur_id].append(time.perf_counter())
-                cur_input = next_ids
-                cur_batch.input_len = 1
-                cur_batch.prompt_lengths = [x + 1 for x in cur_batch.prompt_lengths]
-
-                for index, request_id in enumerate(cur_batch.request_ids):
-
-                    if not self.is_finish.get(request_id, False):
-                        remain = cur_batch.max_tokens - len(self.tokens[cur_id])
-
-                        if self.streamer.get(request_id, None) is None:
-                            self.streamer[request_id] = asyncio.Queue()
-
-                        # Currently ignore eos for benchmark
-                        # if next_ids[index].int() == tokenizer.eos_token_id:
-                        #     remain = 0
-                        #     self.is_finish[request_id] = True
-
-                        if self.token_cache.get(request_id, None) is None:
-                            self.token_cache[request_id] = []
-                            self.print_len[request_id] = 0
-                        self.token_cache[request_id].extend(next_ids[index].tolist())
-
-                        text = tokenizer.decode(self.token_cache[request_id])
-                        if text.endswith("\n"):
-                            printable_text = text[self.print_len[request_id]:]
-                            self.token_cache[request_id] = []
-                            self.print_len[request_id] = 0
-                        elif len(text) > 0 and _is_chinese_char(ord(text[-1])):
-                            printable_text = text[self.print_len[request_id]:]
-                            self.print_len[request_id] += len(printable_text)
-                        else:
-                            printable_text = text[self.print_len[request_id]: text.rfind(" ") + 1]
-                            self.print_len[request_id] += len(printable_text)
+                # logger.info(f"recv {self.rank} {next_ids.shape}")
+                dist.recv(next_ids, src=self.pre_rank)
 
-                        if remain > 0:
-                            await self.streamer[request_id].put((remain, printable_text))
-                        else:
-                            printable_text = printable_text + text[self.print_len[request_id]:]
-                            self.token_cache.pop(request_id, None)
-                            self.print_len.pop(request_id, None)
-                            await self.streamer[request_id].put((remain, printable_text))
-
-                if len(self.tokens[cur_id]) >= cur_batch.max_tokens:
-                    # Finish a batch
-                    outputs = torch.cat(self.tokens[cur_id], dim=1)
-                    outputs = outputs.cpu()
-                    output_strs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
-                    for request_id, output_str in zip(cur_batch.request_ids, output_strs):
-                        with self.dict_lock:
-                            result_dict[request_id] = output_str
-
-                    cur_times = self.token_times[cur_id]
-                    first_token = cur_times[1] - cur_times[0]
-                    next_token = (cur_times[-1] - cur_times[1]) / (len(self.tokens[cur_id]) - 1)
-                    logger.info(f"First token latency: {first_token}, "
-                                f"next token latency: {next_token}")
-                    self.clear_batch(cur_id)
-                    cur_batch.stopped = True
+                if cur_batch.partial_prefilling > 0:
+                    cur_input = self.input_ids_dict[cur_batch.batch_id]
+                else:
+                    if self.tokens.get(cur_id, None) is None:
+                        self.tokens[cur_id] = []
+
+                    if len(next_ids.shape) == 1:
+                        next_ids = next_ids.unsqueeze(0)
+                    self.tokens[cur_id].append(next_ids)
+                    self.token_times[cur_id].append(time.perf_counter())
+                    cur_input = next_ids
+                    cur_batch.input_len = 1
+                    cur_batch.prompt_lengths = [x + 1 for x in cur_batch.prompt_lengths]
+
+                    for index, request_id in enumerate(cur_batch.request_ids):
+
+                        if not self.is_finish.get(request_id, False):
+                            remain = cur_batch.max_tokens - len(self.tokens[cur_id])
+
+                            if self.streamer.get(request_id, None) is None:
+                                self.streamer[request_id] = asyncio.Queue()
+
+                            # Currently ignore eos for benchmark
+                            # if next_ids[index].int() == tokenizer.eos_token_id:
+                            #     remain = 0
+                            #     self.is_finish[request_id] = True
+
+                            if self.token_cache.get(request_id, None) is None:
+                                self.token_cache[request_id] = []
+                                self.print_len[request_id] = 0
+                            self.token_cache[request_id].extend(next_ids[index].tolist())
+
+                            text = tokenizer.decode(self.token_cache[request_id])
+                            if text.endswith("\n"):
+                                printable_text = text[self.print_len[request_id]:]
+                                self.token_cache[request_id] = []
+                                self.print_len[request_id] = 0
+                            elif len(text) > 0 and _is_chinese_char(ord(text[-1])):
+                                printable_text = text[self.print_len[request_id]:]
+                                self.print_len[request_id] += len(printable_text)
+                            else:
+                                r_index = text.rfind(" ") + 1
+                                printable_text = text[self.print_len[request_id]: r_index]
+                                self.print_len[request_id] += len(printable_text)
+
+                            if remain > 0:
+                                await self.streamer[request_id].put((remain, printable_text))
+                            else:
+                                printable_text = printable_text + text[self.print_len[request_id]:]
+                                self.token_cache.pop(request_id, None)
+                                self.print_len.pop(request_id, None)
+                                await self.streamer[request_id].put((remain, printable_text))
+
+                    if len(self.tokens[cur_id]) >= cur_batch.max_tokens:
+                        # Finish a batch
+                        outputs = torch.cat(self.tokens[cur_id], dim=1)
+                        outputs = outputs.cpu()
+                        output_strs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
+                        for request_id, output_str in zip(cur_batch.request_ids, output_strs):
+                            with self.dict_lock:
+                                result_dict[request_id] = output_str
+
+                        cur_times = self.token_times[cur_id]
+                        first_token = cur_times[1] - cur_times[0]
+                        next_token = (cur_times[-1] - cur_times[1]) / (len(self.tokens[cur_id]) - 1)
+                        logger.info(f"First token latency: {first_token}, "
+                                    f"next token latency: {next_token}")
+                        self.clear_batch(cur_id)
+                        cur_batch.stopped = True
             else:
                 if (cur_batch is not None) and cur_batch.stopped:
                     cur_batch = None
 
             if cur_batch is not None:
+                cur_batch = self.prepare_batch(cur_batch)
                 dist.broadcast_object_list([cur_batch], src=0)
         else:
             if self.send_buff is not None:
+                # logger.info(f"send {self.rank} {self.send_buff.shape}")
                 dist.send(self.send_buff, dst=self.next_rank)
 
             batch_list = [None]
@@ -614,14 +752,26 @@
                 if cur_batch.stopped:
                     self.clear_batch(cur_batch.batch_id)
                 else:
+                    cur_batch = self.prepare_batch(cur_batch)
                     cur_len = cur_batch.input_len
-                    cur_input = torch.empty((cur_batch.batch_size, cur_len, self.hidden_size,),
-                                            device=f'xpu:{self.rank}', dtype=self.dtype)
+                    if cur_batch.partial_prefilling:
+                        cur_input = torch.empty(
+                            (cur_batch.partial_prefilling, cur_len, self.hidden_size,),
+                            device=f'xpu:{self.rank}',
+                            dtype=self.dtype,
+                        )
+                    else:
+                        cur_input = torch.empty(
+                            (cur_batch.batch_size, cur_len, self.hidden_size,),
+                            device=f'xpu:{self.rank}',
+                            dtype=self.dtype,
+                        )
+                    # logger.info(f"recv {self.rank} {cur_input.shape}")
                     dist.recv(cur_input, src=self.pre_rank)
 
-            output = self.model_step(cur_input, cur_batch)
-            if output is not None and self.rank == self.world_size - 1:
-                output = torch.argmax(output[:, -1:, :], dim=-1)
+            output, cur_batch = self.model_step(cur_input, cur_batch)
+            # if output is not None and self.rank == self.world_size - 1:
+            #     output = torch.argmax(output[:, -1:, :], dim=-1)
 
             if output is not None:
                 # dist.send(output, dst=self.next_rank)
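Reviewer note (illustration only, not part of the patch): the first-token / next-token latencies logged when a batch finishes come from `token_times`, a list of `time.perf_counter()` stamps with one entry per generated token; index 0 is assumed to be recorded when the batch is created, which is not shown in this diff. A standalone sketch of the arithmetic:

```python
# Illustration only -- reproducing the latency metrics with fake timestamps.
import time

token_times = [time.perf_counter()]        # assumed stamp taken at batch creation
for _ in range(4):                          # pretend we generate 4 tokens
    time.sleep(0.01)
    token_times.append(time.perf_counter())

first_token = token_times[1] - token_times[0]
# Equivalent to dividing by len(self.tokens[cur_id]) - 1 in the patch,
# since the number of generated tokens is len(token_times) - 1.
next_token = (token_times[-1] - token_times[1]) / (len(token_times) - 2)
print(f"First token latency: {first_token:.4f}s, next token latency: {next_token:.4f}s")
```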