diff --git a/vllm/sequence.py b/vllm/sequence.py index 574f06c26582e..34d8158197617 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -168,7 +168,7 @@ def reset_state_for_recompute(self) -> None: self._stage = SequenceStage.PREFILL def get_num_uncomputed_tokens(self) -> int: - """Return the number of prefil tokens that are not computed.""" + """Return the number of prefill tokens that are not computed.""" # we use `get_len()` which includes prompt_len + output_len instead # of prompt_len here. This is because during recompute we need to # prefill for both prompt and output. @@ -353,12 +353,9 @@ def fork(self, new_seq_id: int) -> "Sequence": def get_num_new_tokens(self) -> int: """Get the number of new tokens to be computed. - Args: - remainig_token_budget: The remaining token budgets. Returns: - The new number of tokens to be computed. I.e., 1 for decode, prompt - size for prefill. If there's not enough remainig_token_budget, it - can return the chunked number of new tokens. + The number of new tokens to be computed. I.e., 1 for decode, or + the remaining prompt size for prefill. """ if self.data.stage == SequenceStage.DECODE: return 1