Remove token budget from decode buckets #241

Merged 1 commit on Sep 5, 2024
66 changes: 31 additions & 35 deletions vllm/worker/habana_model_runner.py
@@ -95,8 +95,9 @@ def warmup_range(config: Tuple[int, int, int]):
return list(filter(lambda bucket: bucket >= bmin, buckets))


def warmup_buckets(bs_bucket_config, seq_bucket_config,
max_num_batched_tokens):
def warmup_buckets(bs_bucket_config,
seq_bucket_config,
max_num_batched_tokens=None):
buckets = list(
itertools.product(warmup_range(bs_bucket_config),
warmup_range(seq_bucket_config)))
@@ -107,28 +108,32 @@ def warmup_buckets(bs_bucket_config, seq_bucket_config,
f"seq:{seq_bucket_config}")
raise ValueError(msg)

# Remove buckets exceeding batch token budget
filtered_buckets = list(
filter(lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens,
buckets))

if len(filtered_buckets) == 0:
# legacy case - we can handle this if we ignore max_num_batched_tokens
min_bucket_bs, min_bucket_seq = min(buckets,
key=lambda b: (b[0] * b[1]))
min_reqd_budget = min_bucket_bs * min_bucket_seq
msg = (
"The current bucketing configuration "
f"(min, step, max_warmup): "
f"bs:{bs_bucket_config}, "
f"seq:{seq_bucket_config} cannot be used with specified "
f"max_num_batched_tokens ({max_num_batched_tokens}), as the "
f"smallest bucket ({min_reqd_budget}) would exceed token budget. "
"Please increase max_num_batched_tokens or decrease bucket minimum "
"Ignoring max_num_batched_tokens at risk of out-of-memory errors.")
logger.error(msg)
return list(sorted(buckets, key=lambda b:
(b[0] * b[1], b[1], b[0]))), []
filtered_buckets = buckets
if max_num_batched_tokens is not None:
# Remove buckets exceeding batch token budget
filtered_buckets = list(
filter(
lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens,
buckets))

if len(filtered_buckets) == 0:
# we can handle this if we ignore max_num_batched_tokens
min_bucket_bs, min_bucket_seq = min(buckets,
key=lambda b: (b[0] * b[1]))
min_reqd_budget = min_bucket_bs * min_bucket_seq
msg = (
"The current bucketing configuration "
f"(min, step, max_warmup): "
f"bs:{bs_bucket_config}, "
f"seq:{seq_bucket_config} cannot be used with specified "
f"max_num_batched_tokens ({max_num_batched_tokens}), as the "
f"smallest bucket ({min_reqd_budget}) would exceed token "
"budget. Please increase max_num_batched_tokens or decrease "
"bucket minimum Ignoring max_num_batched_tokens at risk of "
"out-of-memory errors.")
logger.error(msg)
return list(
sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))), []

captured_buckets = list(
sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0])))
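
To make the new behaviour concrete, here is a minimal, self-contained sketch of the optional-budget logic added above. simple_warmup_range is a simplified stand-in for the real warmup_range helper (plain linear steps only), and the bucket configs in the usage lines are illustrative values, not vLLM defaults.

# Minimal sketch of the optional token-budget filter; names and configs
# below are illustrative, not the actual vLLM implementation.
import itertools
from typing import List, Optional, Tuple

def simple_warmup_range(config: Tuple[int, int, int]) -> List[int]:
    bmin, bstep, bmax = config
    # Simplified: plain linear steps from bmin to bmax.
    return list(range(bmin, bmax + 1, bstep))

def sketch_warmup_buckets(bs_cfg: Tuple[int, int, int],
                          seq_cfg: Tuple[int, int, int],
                          max_num_batched_tokens: Optional[int] = None):
    buckets = list(
        itertools.product(simple_warmup_range(bs_cfg),
                          simple_warmup_range(seq_cfg)))
    filtered = buckets
    if max_num_batched_tokens is not None:
        # Budget filtering only happens when a budget is actually passed.
        filtered = [b for b in buckets
                    if b[0] * b[1] <= max_num_batched_tokens]
    captured = sorted(filtered, key=lambda b: (b[0] * b[1], b[1], b[0]))
    omitted = sorted(set(buckets) - set(filtered))
    return captured, omitted

# Decode-style call after this PR: no budget, every bucket is kept.
decode_buckets, _ = sketch_warmup_buckets((1, 32, 64), (128, 128, 2048))
# Prompt-style call: the budget still prunes oversized buckets.
prompt_buckets, omitted = sketch_warmup_buckets((1, 32, 64), (128, 128, 2048),
                                                max_num_batched_tokens=4096)

With no budget argument every (batch_size, seq_len) combination is kept, which is exactly what the decode path relies on after this change.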
@@ -589,9 +594,8 @@ def _setup_buckets(self) -> None:
f"bs:{self.decode_bs_bucket_cfg}, "
f"seq:{self.decode_seq_bucket_cfg}")
logger.info(msg)
self.decode_buckets, decode_omitted_buckets = warmup_buckets(
self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg,
self.max_num_batched_tokens)
self.decode_buckets, _ = warmup_buckets(self.decode_bs_bucket_cfg,
self.decode_seq_bucket_cfg)
if self.lora_config:
self.decode_buckets[:] = [
bucket for bucket in self.decode_buckets
Expand All @@ -601,14 +605,6 @@ def _setup_buckets(self) -> None:
f"{list(sorted(self.decode_buckets))}")
logger.info(msg)

msg = (f"Omitted {len(decode_omitted_buckets)} "
"decode buckets due to exceeded token budget "
f"(max_num_batched_tokens={self.max_num_batched_tokens})")
logger.info(msg)

msg = f"Omitted decode buckets: {list(sorted(decode_omitted_buckets))}"
logger.debug(msg)

def _prepare_prompt(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
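
The diff itself does not spell out why the budget can be dropped for decode; the assumed rationale is that a decode step schedules one new token per running sequence, so the per-step batched-token count is the batch size, whereas the bs * seq product the removed filter checked reflects context length rather than tokens scheduled per step. A back-of-the-envelope sketch with made-up numbers:

# Illustration of the assumed rationale; all numbers are hypothetical.
max_num_batched_tokens = 4096

bucket_bs, bucket_seq = 64, 2048                     # a hypothetical decode bucket
tokens_scheduled_per_decode_step = bucket_bs          # one new token per sequence
old_filter_metric = bucket_bs * bucket_seq            # what the removed filter compared

assert tokens_scheduled_per_decode_step <= max_num_batched_tokens
assert old_filter_metric > max_num_batched_tokens     # the old filter would have dropped this bucket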