linting/formatting fixes
afeldman-nm authored and dbogunowicz committed Mar 26, 2024
1 parent c89b45d commit a9eda02
Showing 10 changed files with 110 additions and 69 deletions.
2 changes: 1 addition & 1 deletion benchmarks/backend_request_func.py
@@ -193,7 +193,7 @@ async def async_request_deepspeed_mii(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

# DeepSpeed-MII doesn't support streaming
# DeepSpeed-MII doesn't support streaming
# as of Jan 28 2024, will use 0 as placeholder.
# https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
6 changes: 4 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -293,7 +293,8 @@ def main(args: argparse.Namespace):

# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
file_name = f"{backend}-{args.request_rate}qps-" \
f"{base_model_id}-{current_dt}.json"
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)

@@ -341,7 +342,8 @@ def main(args: argparse.Namespace):
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default model tokenizer.",
"Name or path of the tokenizer, if not " \
"using the default model tokenizer.",
)
parser.add_argument(
"--best-of",
4 changes: 3 additions & 1 deletion examples/offline_inference_enc_dec.py
@@ -71,6 +71,8 @@
native_output, skip_special_tokens=True) # Decode the generated text
vllm_generated_text = vllm_output.outputs[0].text
print(
f"Prompt: {prompt!r}, Native PyTorch generated text: {native_generated_text!r}, vLLM generated text: {vllm_generated_text!r}"
f"Prompt: {prompt!r}, Native PyTorch generated text: " \
"{native_generated_text!r}, " \
"vLLM generated text: {vllm_generated_text!r}"
)
i += 1
6 changes: 4 additions & 2 deletions examples/offline_inference_with_prefix.py
@@ -37,8 +37,10 @@

print("-" * 80)

# The llm.generate call will batch all prompts and send the batch at once if resources allow.
# The prefix will only be cached after the first batch is processed, so we need to call generate once
# The llm.generate call will batch all prompts
# and send the batch at once if resources allow.
# The prefix will only be cached after the
# first batch is processed, so we need to call generate once
# to calculate the prefix and cache it.
outputs = llm.generate(generating_prompts[0], sampling_params)
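
A minimal sketch of the warm-up pattern the comment above describes; the model name, prompts, and sampling parameters below are placeholders for illustration, not the example's actual values:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
sampling_params = SamplingParams(temperature=0.0)
prefix = "You are a helpful assistant. "
generating_prompts = [prefix + q for q in ("What is 2+2?", "Name a prime number.")]

# The first generate call processes the shared prefix so it can be cached...
_ = llm.generate(generating_prompts[0], sampling_params)
# ...subsequent calls batch all prompts and can reuse the cached prefix blocks.
outputs = llm.generate(generating_prompts, sampling_params)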

21 changes: 14 additions & 7 deletions setup.py
@@ -11,7 +11,8 @@
import setuptools
import torch
import torch.utils.cpp_extension as torch_cpp_ext
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
from torch.utils.cpp_extension import (BuildExtension, CUDAExtension,
CUDA_HOME, ROCM_HOME)

ROOT_DIR = os.path.dirname(__file__)

@@ -53,7 +54,8 @@ def _is_cuda() -> bool:
if _is_hip():
if ROCM_HOME is None:
raise RuntimeError(
"Cannot find ROCM_HOME. ROCm must be available to build the package."
"Cannot find ROCM_HOME. " \
"ROCm must be available to build the package."
)
NVCC_FLAGS += ["-DUSE_ROCM"]
NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
@@ -139,7 +141,8 @@ def get_pytorch_rocm_arch() -> Set[str]:
"""
env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)

# If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator
# If we don't have PYTORCH_ROCM_ARCH
# specified pull the list from rocm_agent_enumerator
if env_arch_list is None:
command = "rocm_agent_enumerator"
env_arch_list = (subprocess.check_output(
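
A small standalone sketch of the fallback the comment above describes (the output format of rocm_agent_enumerator is an assumption, and this is not the actual setup.py code):

import os
import subprocess

env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)
if env_arch_list is None:
    # Ask the ROCm toolchain which architectures the installed GPUs report.
    env_arch_list = subprocess.check_output(["rocm_agent_enumerator"], text=True)
# e.g. "gfx90a\ngfx90a\n" -> {"gfx90a"}; real code may also need to filter
# placeholder entries such as gfx000.
arch_set = set(env_arch_list.split())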
@@ -250,10 +253,14 @@ def get_torch_arch_list() -> Set[str]:
"CUDA 11.1 or higher is required for compute capability 8.6.")
if nvcc_cuda_version < Version("11.8"):
if any(cc.startswith("8.9") for cc in compute_capabilities):
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
# However, GPUs with compute capability 8.9 can also run the code generated by
# the previous versions of CUDA 11 and targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
# CUDA 11.8 is required to generate the
# code targeting compute capability 8.9.
# However, GPUs with compute capability
# 8.9 can also run the code generated by
# the previous versions of CUDA 11 and
# targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available,
# we target compute capability 8.0
# instead of 8.9.
warnings.warn(
"CUDA 11.8 or higher is required for compute capability 8.9. "
23 changes: 11 additions & 12 deletions vllm/model_executor/layers/attention/enc_dec_attention.py
@@ -145,22 +145,22 @@ def forward(
block_size = value_cache.shape[3]
prompt_table_len = (max_prompt_len + block_size - 1) // block_size
self_attn_block_tables = input_metadata.block_tables[:,
prompt_table_len:].contiguous(
)
prompt_table_len:].contiguous(
)

output = PagedAttentionImpl.forward_decode(
query,
key_cache,
value_cache,
input_metadata,
self.num_heads,
self.scale,
None, # No alibi slopes
apply_attn_bias=True, # Relative positional encoding (utilized i.e. by T5),
None, # No alibi slopes
apply_attn_bias=
True, # Relative positional encoding (utilized i.e. by T5),
override_context_lens=input_metadata.context_lens,
override_max_context_len=input_metadata.max_context_len,
override_block_tables=self_attn_block_tables
)
override_block_tables=self_attn_block_tables)
return output.view(batch_size, seq_len, hidden_size)
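
A toy illustration (assumed shapes and values) of the block-table split used in this hunk: the first prompt_table_len columns address the cached encoder-prompt blocks, and the remaining columns address decoder self-attention blocks.

import torch

block_tables = torch.arange(24).reshape(2, 12)  # [batch_size, max_blocks], toy data
max_prompt_len, block_size = 37, 16
prompt_table_len = (max_prompt_len + block_size - 1) // block_size  # -> 3
cross_attn_block_tables = block_tables[:, :prompt_table_len].contiguous()  # first 3 columns
self_attn_block_tables = block_tables[:, prompt_table_len:].contiguous()   # remaining columns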


@@ -215,8 +215,8 @@ def forward(
block_size = value_cache.shape[3]
prompt_table_len = (max_prompt_len + block_size - 1) // block_size
cross_attn_block_tables = input_metadata.block_tables[:, :
prompt_table_len].contiguous(
)
prompt_table_len].contiguous(
)

# Cross-attention decode run.
output = PagedAttentionImpl.forward_decode(
@@ -226,11 +226,10 @@
input_metadata,
self.num_heads,
self.scale,
None, # No alibi slopes
None, # No alibi slopes
apply_attn_bias=False,
override_context_lens=input_metadata.prompt_lens.int(),
override_max_context_len=max_prompt_len,
override_block_tables=cross_attn_block_tables
)
override_block_tables=cross_attn_block_tables)

return output.view(batch_size, seq_len, hidden_size)
42 changes: 24 additions & 18 deletions vllm/model_executor/layers/attention/ops/paged_attn.py
@@ -37,17 +37,17 @@ def reshape_and_cache(

@staticmethod
def forward_decode(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
apply_attn_bias: bool = False,
override_context_lens: Optional[torch.Tensor] = None,
override_max_context_len: Optional[int] = None,
override_block_tables: Optional[torch.Tensor] = None
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
apply_attn_bias: bool = False,
override_context_lens: Optional[torch.Tensor] = None,
override_max_context_len: Optional[int] = None,
override_block_tables: Optional[torch.Tensor] = None
) -> torch.Tensor:
output = torch.empty_like(query)

@@ -56,7 +56,7 @@ def forward_decode(
max_num_partitions = (
(input_metadata.max_context_len + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)

attn_bias = input_metadata.attn_bias
if apply_attn_bias and attn_bias is not None:
attn_bias = attn_bias.to(torch.float32)
@@ -79,10 +79,13 @@
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables if override_block_tables is None else override_block_tables,
input_metadata.context_lens if override_context_lens is None else override_context_lens,
input_metadata.block_tables
if override_block_tables is None else override_block_tables,
input_metadata.context_lens
if override_context_lens is None else override_context_lens,
block_size,
input_metadata.max_context_len if override_max_context_len is None else override_max_context_len,
input_metadata.max_context_len if
override_max_context_len is None else override_max_context_len,
alibi_slopes,
attn_bias if apply_attn_bias else None,
input_metadata.kv_cache_dtype,
@@ -111,10 +114,13 @@ def forward_decode(
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables if override_block_tables is None else override_block_tables,
input_metadata.context_lens if override_context_lens is None else override_context_lens,
input_metadata.block_tables
if override_block_tables is None else override_block_tables,
input_metadata.context_lens
if override_context_lens is None else override_context_lens,
block_size,
input_metadata.max_context_len if override_max_context_len is None else override_max_context_len,
input_metadata.max_context_len if
override_max_context_len is None else override_max_context_len,
alibi_slopes,
attn_bias if apply_attn_bias else None,
input_metadata.kv_cache_dtype,
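
The override_* arguments in these calls follow a simple "use the override when given, otherwise fall back to input_metadata" pattern; a hypothetical helper (not part of vLLM) that captures it:

from typing import Optional, TypeVar

T = TypeVar("T")

def pick(override: Optional[T], default: T) -> T:
    # Return the override when provided, otherwise the metadata default.
    return default if override is None else override

# e.g. block_tables = pick(override_block_tables, input_metadata.block_tables)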
48 changes: 31 additions & 17 deletions vllm/model_executor/models/t5.py
@@ -53,16 +53,20 @@ class T5LayerNorm(nn.Module):

def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
Construct a layernorm module in the T5 style.
No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps

def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
# Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
# w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
# T5 uses a layer_norm which only scales and doesn't shift,
# which is also known as Root Mean
# Square Layer Normalization https://arxiv.org/abs/1910.07467
# thus variance is calculated
# w/o mean and there is no bias. Additionally we want
# to make sure that the accumulation for
# half-precision inputs is done in fp32

variance = hidden_states.to(torch.float32).pow(2).mean(-1,
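
A short functional sketch of the RMS-style normalization the comment describes (scale-only, no mean subtraction, variance accumulated in fp32); an illustration consistent with the comment, not the module's exact code:

import torch

def rms_layer_norm(hidden_states: torch.Tensor,
                   weight: torch.Tensor,
                   eps: float = 1e-6) -> torch.Tensor:
    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    normed = hidden_states.to(torch.float32) * torch.rsqrt(variance + eps)
    # Cast back so half-precision inputs come out in the weight's dtype.
    return weight * normed.to(weight.dtype)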
@@ -143,8 +147,10 @@ def __init__(
):
super().__init__()
self.is_decoder = config.is_decoder
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.relative_attention_num_buckets = \
config.relative_attention_num_buckets
self.relative_attention_max_distance = \
config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
total_num_heads = config.num_heads
@@ -184,12 +190,18 @@ def _relative_position_bucket(relative_position,
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Translate relative position to a bucket number for relative
attention. The relative position is defined as
memory_position - query_position, i.e. the distance
in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative
positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for
larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All
relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to
longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor
@@ -198,7 +210,8 @@ def _relative_position_bucket(relative_position,
max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
a Tensor with the same shape as relative_position,
containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
@@ -215,7 +228,8 @@ def _relative_position_bucket(relative_position,
max_exact = num_buckets // 2
is_small = relative_position < max_exact

# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
# The other half of the buckets are for logarithmically
# bigger bins in positions up to max_distance
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact) /
math.log(max_distance / max_exact) *
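
A toy, self-contained version of the unidirectional bucketing the docstring describes; the num_buckets and max_distance defaults are assumptions for illustration, and this is a sketch rather than the model's method:

import math
import torch

def toy_relative_position_bucket(relative_position: torch.Tensor,
                                 num_buckets: int = 32,
                                 max_distance: int = 128) -> torch.Tensor:
    # Unidirectional case: only non-positive distances (past tokens) are valid.
    relative_position = -torch.min(relative_position,
                                   torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    # Log-spaced buckets for larger distances, clamped to the last bucket.
    # clamp(min=1) only guards log(0) on positions the mask discards anyway.
    relative_position_if_large = max_exact + (
        torch.log(relative_position.float().clamp(min=1.0) / max_exact) /
        math.log(max_distance / max_exact) * (num_buckets - max_exact)).long()
    relative_position_if_large = torch.min(
        relative_position_if_large,
        torch.full_like(relative_position_if_large, num_buckets - 1))
    return torch.where(is_small, relative_position, relative_position_if_large)

# Small distances keep their own buckets; distances >= max_distance share bucket 31.
print(toy_relative_position_bucket(torch.tensor([0, -1, -15, -16, -64, -500])))
# -> tensor([ 0,  1, 15, 16, 26, 31])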
@@ -296,9 +310,9 @@ def forward(
1 if input_metadata.is_prompt else context_len,
(context_len + block_size - 1) // block_size *
block_size).repeat(batch_size, 1, 1, 1)
input_metadata.attn_bias = position_bias[:, :,
-seq_len:, :].contiguous(
)
input_metadata.attn_bias = \
position_bias[:, :,-seq_len:, :] \
.contiguous()

key_cache, value_cache = kv_cache

3 changes: 2 additions & 1 deletion vllm/sequence.py
@@ -175,7 +175,8 @@ def __init__(
self.logical_token_blocks: List[LogicalTokenBlock] = []
initial_token_ids = prompt_token_ids
if is_encoder_decoder:
# We need to separate the prompt and generated tokens for encoder-decoder models.
# We need to separate the prompt and
# generated tokens for encoder-decoder models.
num_prompt_blocks = (len(prompt_token_ids) + block_size -
1) // block_size
padded_prompt_len = num_prompt_blocks * block_size
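
A worked example (toy numbers) of the padding this hunk introduces: the prompt is rounded up to a whole number of logical blocks so that generated tokens start in a fresh block.

block_size = 16
prompt_token_ids = list(range(37))  # 37 prompt tokens, toy data
num_prompt_blocks = (len(prompt_token_ids) + block_size - 1) // block_size  # -> 3
padded_prompt_len = num_prompt_blocks * block_size  # -> 48
# 11 slots of padding are reserved; generation begins at the top of block 3.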
24 changes: 16 additions & 8 deletions vllm/worker/model_runner.py
@@ -85,11 +85,14 @@ def __init__(
self.model_config.enforce_eager = True

# Unpack HF is_encoder_decoder config attribute
# NOTE: must handle "self.model_config is None" case imposed by certain tests i.e. test_prepare_prompt()
# In the None case, default to is_encoder_decoder == False since vLLM decoder-only mode is known to handle
# NOTE: must handle "self.model_config is None" case imposed by
# certain tests i.e. test_prepare_prompt()
# In the None case, default to is_encoder_decoder == False
# since vLLM decoder-only mode is known to handle
# the None case correctly.
self.is_encoder_decoder = False if self.model_config is None else \
getattr(self.model_config.hf_config, "is_encoder_decoder", False)
getattr(self.model_config.hf_config, \
"is_encoder_decoder", False)

def load_model(self) -> None:
with measure_cuda_memory() as m:
@@ -162,7 +165,8 @@ def _prepare_prompt(
computed_block_nums = seq_group_metadata.computed_block_nums
if self.is_encoder_decoder:
# Encoder/decoder mode does not support prefix cache
assert computed_block_nums is None or len(computed_block_nums) == 0, \
assert computed_block_nums is None or \
len(computed_block_nums) == 0, \
"Encoder decoder models do not support Prefix Cache yet"
# Context length == 1 due to decoder_start token
context_lens.append(1)
@@ -267,7 +271,8 @@ def _prepare_prompt(
device=self.device)
if self.is_encoder_decoder:
padded_block_tables = []
# Pad the encoder block tables to the same length and then add a decoder block table in the end
# Pad the encoder block tables to the same length
# and then add a decoder block table in the end
for block_table in block_tables:
block_table = block_table[:-1] + [0] * (
max_block_table_len - len(block_table)) + block_table[-1:]
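
A toy illustration (assumed values) of the padding loop above: encoder blocks stay in place, zeros pad every row to the same width, and the single decoder block is kept at the end.

max_block_table_len = 5
block_table = [7, 8, 9, 42]  # encoder blocks [7, 8, 9] + decoder block [42], toy values
padded = (block_table[:-1] + [0] * (max_block_table_len - len(block_table)) +
          block_table[-1:])
# -> [7, 8, 9, 0, 42]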
@@ -353,11 +358,13 @@ def _prepare_decode(
prompt_lens.append(prompt_len)

if self.is_encoder_decoder:
# Encoder-decoder model stores prompt and generation tokens separately,
# Encoder-decoder model stores
# prompt and generation tokens separately,
# so we need to adjust to the pad.
prompt_blocks_num = (prompt_len + self.block_size -
1) // self.block_size
prompt_pad = prompt_blocks_num * self.block_size - prompt_len
prompt_pad = prompt_blocks_num * \
self.block_size - prompt_len
position += prompt_pad + 1 # One extra for decoder_start_id

if self.is_encoder_decoder:
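
And a worked example (toy numbers) of the position shift just above: encoder-decoder decode positions are advanced past the prompt padding plus the decoder_start token.

block_size = 16
prompt_len = 37
position = 2  # toy decode position before adjustment
prompt_blocks_num = (prompt_len + block_size - 1) // block_size  # -> 3
prompt_pad = prompt_blocks_num * block_size - prompt_len  # -> 11
position += prompt_pad + 1  # -> 14; one extra for decoder_start_id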
@@ -385,7 +392,8 @@ def _prepare_decode(
block_tables.append(block_table)
if self.is_encoder_decoder:
padded_block_tables = []
# Pad the encoder block tables to the same length and then add a decoder block table in the end
# Pad the encoder block tables to the same length
# and then add a decoder block table in the end
for block_table in block_tables:
block_table = block_table[:-1] + [0] * (
max_block_table_len - len(block_table)) + block_table[-1:]
