linting/formatting fixes
afeldman-nm authored and dbogunowicz committed Mar 26, 2024
1 parent c89b45d commit a9eda02
Showing 10 changed files with 110 additions and 69 deletions.
2 changes: 1 addition & 1 deletion benchmarks/backend_request_func.py
@@ -193,7 +193,7 @@ async def async_request_deepspeed_mii(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

# DeepSpeed-MII doesn't support streaming
# DeepSpeed-MII doesn't support streaming
# as of Jan 28 2024, will use 0 as placeholder.
# https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
6 changes: 4 additions & 2 deletions benchmarks/benchmark_serving.py
@@ -293,7 +293,8 @@ def main(args: argparse.Namespace):

# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
file_name = f"{backend}-{args.request_rate}qps-" \
f"{base_model_id}-{current_dt}.json"
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)

@@ -341,7 +342,8 @@ def main(args: argparse.Namespace):
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default model tokenizer.",
"Name or path of the tokenizer, if not " \
"using the default model tokenizer.",
)
parser.add_argument(
"--best-of",
4 changes: 3 additions & 1 deletion examples/offline_inference_enc_dec.py
@@ -71,6 +71,8 @@
native_output, skip_special_tokens=True) # Decode the generated text
vllm_generated_text = vllm_output.outputs[0].text
print(
f"Prompt: {prompt!r}, Native PyTorch generated text: {native_generated_text!r}, vLLM generated text: {vllm_generated_text!r}"
f"Prompt: {prompt!r}, Native PyTorch generated text: " \
"{native_generated_text!r}, " \
"vLLM generated text: {vllm_generated_text!r}"
)
i += 1
6 changes: 4 additions & 2 deletions examples/offline_inference_with_prefix.py
@@ -37,8 +37,10 @@

print("-" * 80)

# The llm.generate call will batch all prompts and send the batch at once if resources allow.
# The prefix will only be cached after the first batch is processed, so we need to call generate once
# The llm.generate call will batch all prompts
# and send the batch at once if resources allow.
# The prefix will only be cached after the
# first batch is processed, so we need to call generate once
# to calculate the prefix and cache it.
outputs = llm.generate(generating_prompts[0], sampling_params)
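
A minimal sketch of the warm-up pattern the comment above describes; the model name, prompts, and sampling parameters below are placeholders for illustration, not the example's actual values:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
sampling_params = SamplingParams(temperature=0.0)
prefix = "You are a helpful assistant. "
generating_prompts = [prefix + q for q in ("What is 2+2?", "Name a prime number.")]

# The first generate call processes the shared prefix so it can be cached...
_ = llm.generate(generating_prompts[0], sampling_params)
# ...subsequent calls batch all prompts and can reuse the cached prefix blocks.
outputs = llm.generate(generating_prompts, sampling_params)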

21 changes: 14 additions & 7 deletions setup.py
@@ -11,7 +11,8 @@
import setuptools
import torch
import torch.utils.cpp_extension as torch_cpp_ext
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
from torch.utils.cpp_extension import (BuildExtension, CUDAExtension,
CUDA_HOME, ROCM_HOME)

ROOT_DIR = os.path.dirname(__file__)

@@ -53,7 +54,8 @@ def _is_cuda() -> bool:
if _is_hip():
if ROCM_HOME is None:
raise RuntimeError(
"Cannot find ROCM_HOME. ROCm must be available to build the package."
"Cannot find ROCM_HOME. " \
"ROCm must be available to build the package."
)
NVCC_FLAGS += ["-DUSE_ROCM"]
NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
@@ -139,7 +141,8 @@ def get_pytorch_rocm_arch() -> Set[str]:
"""
env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)

# If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator
# If we don't have PYTORCH_ROCM_ARCH
# specified pull the list from rocm_agent_enumerator
if env_arch_list is None:
command = "rocm_agent_enumerator"
env_arch_list = (subprocess.check_output(
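
A small standalone sketch of the fallback the comment above describes (the output format of rocm_agent_enumerator is an assumption, and this is not the actual setup.py code):

import os
import subprocess

env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)
if env_arch_list is None:
    # Ask the ROCm toolchain which architectures the installed GPUs report.
    env_arch_list = subprocess.check_output(["rocm_agent_enumerator"], text=True)
# e.g. "gfx90a\ngfx90a\n" -> {"gfx90a"}; real code may also need to filter
# placeholder entries such as gfx000.
arch_set = set(env_arch_list.split())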
@@ -250,10 +253,14 @@ def get_torch_arch_list() -> Set[str]:
"CUDA 11.1 or higher is required for compute capability 8.6.")
if nvcc_cuda_version < Version("11.8"):
if any(cc.startswith("8.9") for cc in compute_capabilities):
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
# However, GPUs with compute capability 8.9 can also run the code generated by
# the previous versions of CUDA 11 and targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
# CUDA 11.8 is required to generate the
# code targeting compute capability 8.9.
# However, GPUs with compute capability
# 8.9 can also run the code generated by
# the previous versions of CUDA 11 and
# targeting compute capability 8.0.
# Therefore, if CUDA 11.8 is not available,
# we target compute capability 8.0
# instead of 8.9.
warnings.warn(
"CUDA 11.8 or higher is required for compute capability 8.9. "
23 changes: 11 additions & 12 deletions vllm/model_executor/layers/attention/enc_dec_attention.py
@@ -145,22 +145,22 @@ def forward(
block_size = value_cache.shape[3]
prompt_table_len = (max_prompt_len + block_size - 1) // block_size
self_attn_block_tables = input_metadata.block_tables[:,
prompt_table_len:].contiguous(
)
prompt_table_len:].contiguous(
)

output = PagedAttentionImpl.forward_decode(
query,
key_cache,
value_cache,
input_metadata,
self.num_heads,
self.scale,
None, # No alibi slopes
apply_attn_bias=True, # Relative positional encoding (utilized i.e. by T5),
None, # No alibi slopes
apply_attn_bias=
True, # Relative positional encoding (utilized i.e. by T5),
override_context_lens=input_metadata.context_lens,
override_max_context_len=input_metadata.max_context_len,
override_block_tables=self_attn_block_tables
)
override_block_tables=self_attn_block_tables)
return output.view(batch_size, seq_len, hidden_size)
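
A toy illustration (assumed shapes and values) of the block-table split used in this hunk: the first prompt_table_len columns address the cached encoder-prompt blocks, and the remaining columns address decoder self-attention blocks.

import torch

block_tables = torch.arange(24).reshape(2, 12)  # [batch_size, max_blocks], toy data
max_prompt_len, block_size = 37, 16
prompt_table_len = (max_prompt_len + block_size - 1) // block_size  # -> 3
cross_attn_block_tables = block_tables[:, :prompt_table_len].contiguous()  # first 3 columns
self_attn_block_tables = block_tables[:, prompt_table_len:].contiguous()   # remaining columns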


@@ -215,8 +215,8 @@ def forward(
block_size = value_cache.shape[3]
prompt_table_len = (max_prompt_len + block_size - 1) // block_size
cross_attn_block_tables = input_metadata.block_tables[:, :
prompt_table_len].contiguous(
)
prompt_table_len].contiguous(
)

# Cross-attention decode run.
output = PagedAttentionImpl.forward_decode(
@@ -226,11 +226,10 @@
input_metadata,
self.num_heads,
self.scale,
None, # No alibi slopes
None, # No alibi slopes
apply_attn_bias=False,
override_context_lens=input_metadata.prompt_lens.int(),
override_max_context_len=max_prompt_len,
override_block_tables=cross_attn_block_tables
)
override_block_tables=cross_attn_block_tables)

return output.view(batch_size, seq_len, hidden_size)
42 changes: 24 additions & 18 deletions vllm/model_executor/layers/attention/ops/paged_attn.py
@@ -37,17 +37,17 @@ def reshape_and_cache(

@staticmethod
def forward_decode(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
apply_attn_bias: bool = False,
override_context_lens: Optional[torch.Tensor] = None,
override_max_context_len: Optional[int] = None,
override_block_tables: Optional[torch.Tensor] = None
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
apply_attn_bias: bool = False,
override_context_lens: Optional[torch.Tensor] = None,
override_max_context_len: Optional[int] = None,
override_block_tables: Optional[torch.Tensor] = None
) -> torch.Tensor:
output = torch.empty_like(query)

@@ -56,7 +56,7 @@ def forward_decode(
max_num_partitions = (
(input_metadata.max_context_len + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)

attn_bias = input_metadata.attn_bias
if apply_attn_bias and attn_bias is not None:
attn_bias = attn_bias.to(torch.float32)
@@ -79,10 +79,13 @@
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables if override_block_tables is None else override_block_tables,
input_metadata.context_lens if override_context_lens is None else override_context_lens,
input_metadata.block_tables
if override_block_tables is None else override_block_tables,
input_metadata.context_lens
if override_context_lens is None else override_context_lens,
block_size,
input_metadata.max_context_len if override_max_context_len is None else override_max_context_len,
input_metadata.max_context_len if
override_max_context_len is None else override_max_context_len,
alibi_slopes,
attn_bias if apply_attn_bias else None,
input_metadata.kv_cache_dtype,
@@ -111,10 +114,13 @@ def forward_decode(
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables if override_block_tables is None else override_block_tables,
input_metadata.context_lens if override_context_lens is None else override_context_lens,
input_metadata.block_tables
if override_block_tables is None else override_block_tables,
input_metadata.context_lens
if override_context_lens is None else override_context_lens,
block_size,
input_metadata.max_context_len if override_max_context_len is None else override_max_context_len,
input_metadata.max_context_len if
override_max_context_len is None else override_max_context_len,
alibi_slopes,
attn_bias if apply_attn_bias else None,
input_metadata.kv_cache_dtype,
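
The override_* arguments in these calls follow a simple "use the override when given, otherwise fall back to input_metadata" pattern; a hypothetical helper (not part of vLLM) that captures it:

from typing import Optional, TypeVar

T = TypeVar("T")

def pick(override: Optional[T], default: T) -> T:
    # Return the override when provided, otherwise the metadata default.
    return default if override is None else override

# e.g. block_tables = pick(override_block_tables, input_metadata.block_tables)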
48 changes: 31 additions & 17 deletions vllm/model_executor/models/t5.py
@@ -53,16 +53,20 @@ class T5LayerNorm(nn.Module):

def __init__(self, hidden_size, eps=1e-6):
"""
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
Construct a layernorm module in the T5 style.
No bias and no subtraction of mean.
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps

def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
# Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
# w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
# T5 uses a layer_norm which only scales and doesn't shift,
# which is also known as Root Mean
# Square Layer Normalization https://arxiv.org/abs/1910.07467
# thus variance is calculated
# w/o mean and there is no bias. Additionally we want
# to make sure that the accumulation for
# half-precision inputs is done in fp32

variance = hidden_states.to(torch.float32).pow(2).mean(-1,
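
A short functional sketch of the RMS-style normalization the comment describes (scale-only, no mean subtraction, variance accumulated in fp32); an illustration consistent with the comment, not the module's exact code:

import torch

def rms_layer_norm(hidden_states: torch.Tensor,
                   weight: torch.Tensor,
                   eps: float = 1e-6) -> torch.Tensor:
    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    normed = hidden_states.to(torch.float32) * torch.rsqrt(variance + eps)
    # Cast back so half-precision inputs come out in the weight's dtype.
    return weight * normed.to(weight.dtype)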
@@ -143,8 +147,10 @@ def __init__(
):
super().__init__()
self.is_decoder = config.is_decoder
self.relative_attention_num_buckets = config.relative_attention_num_buckets
self.relative_attention_max_distance = config.relative_attention_max_distance
self.relative_attention_num_buckets = \
config.relative_attention_num_buckets
self.relative_attention_max_distance = \
config.relative_attention_max_distance
self.d_model = config.d_model
self.key_value_proj_dim = config.d_kv
total_num_heads = config.num_heads
@@ -184,12 +190,18 @@ def _relative_position_bucket(relative_position,
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on
Translate relative position to a bucket number for relative
attention. The relative position is defined as
memory_position - query_position, i.e. the distance
in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative
positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for
larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All
relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to
longer sequences than the model has been trained on
Args:
relative_position: an int32 Tensor
@@ -198,7 +210,8 @@ def _relative_position_bucket(relative_position,
max_distance: an integer
Returns:
a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
a Tensor with the same shape as relative_position,
containing int32 values in the range [0, num_buckets)
"""
relative_buckets = 0
if bidirectional:
@@ -215,7 +228,8 @@ def _relative_position_bucket(relative_position,
max_exact = num_buckets // 2
is_small = relative_position < max_exact

# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
# The other half of the buckets are for logarithmically
# bigger bins in positions up to max_distance
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact) /
math.log(max_distance / max_exact) *
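
A toy, self-contained version of the unidirectional bucketing the docstring describes; the num_buckets and max_distance defaults are assumptions for illustration, and this is a sketch rather than the model's method:

import math
import torch

def toy_relative_position_bucket(relative_position: torch.Tensor,
                                 num_buckets: int = 32,
                                 max_distance: int = 128) -> torch.Tensor:
    # Unidirectional case: only non-positive distances (past tokens) are valid.
    relative_position = -torch.min(relative_position,
                                   torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    # Log-spaced buckets for larger distances, clamped to the last bucket.
    # clamp(min=1) only guards log(0) on positions the mask discards anyway.
    relative_position_if_large = max_exact + (
        torch.log(relative_position.float().clamp(min=1.0) / max_exact) /
        math.log(max_distance / max_exact) * (num_buckets - max_exact)).long()
    relative_position_if_large = torch.min(
        relative_position_if_large,
        torch.full_like(relative_position_if_large, num_buckets - 1))
    return torch.where(is_small, relative_position, relative_position_if_large)

# Small distances keep their own buckets; distances >= max_distance share bucket 31.
print(toy_relative_position_bucket(torch.tensor([0, -1, -15, -16, -64, -500])))
# -> tensor([ 0,  1, 15, 16, 26, 31])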
@@ -296,9 +310,9 @@ def forward(
1 if input_metadata.is_prompt else context_len,
(context_len + block_size - 1) // block_size *
block_size).repeat(batch_size, 1, 1, 1)
input_metadata.attn_bias = position_bias[:, :,
-seq_len:, :].contiguous(
)
input_metadata.attn_bias = \
position_bias[:, :,-seq_len:, :] \
.contiguous()

key_cache, value_cache = kv_cache

3 changes: 2 additions & 1 deletion vllm/sequence.py
@@ -175,7 +175,8 @@ def __init__(
self.logical_token_blocks: List[LogicalTokenBlock] = []
initial_token_ids = prompt_token_ids
if is_encoder_decoder:
# We need to separate the prompt and generated tokens for encoder-decoder models.
# We need to separate the prompt and
# generated tokens for encoder-decoder models.
num_prompt_blocks = (len(prompt_token_ids) + block_size -
1) // block_size
padded_prompt_len = num_prompt_blocks * block_size
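
A worked example (toy numbers) of the padding this hunk introduces: the prompt is rounded up to a whole number of logical blocks so that generated tokens start in a fresh block.

block_size = 16
prompt_token_ids = list(range(37))  # 37 prompt tokens, toy data
num_prompt_blocks = (len(prompt_token_ids) + block_size - 1) // block_size  # -> 3
padded_prompt_len = num_prompt_blocks * block_size  # -> 48
# 11 slots of padding are reserved; generation begins at the top of block 3.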
24 changes: 16 additions & 8 deletions vllm/worker/model_runner.py
@@ -85,11 +85,14 @@ def __init__(
self.model_config.enforce_eager = True

# Unpack HF is_encoder_decoder config attribute
# NOTE: must handle "self.model_config is None" case imposed by certain tests i.e. test_prepare_prompt()
# In the None case, default to is_encoder_decoder == False since vLLM decoder-only mode is known to handle
# NOTE: must handle "self.model_config is None" case imposed by
# certain tests i.e. test_prepare_prompt()
# In the None case, default to is_encoder_decoder == False
# since vLLM decoder-only mode is known to handle
# the None case correctly.
self.is_encoder_decoder = False if self.model_config is None else \
getattr(self.model_config.hf_config, "is_encoder_decoder", False)
getattr(self.model_config.hf_config, \
"is_encoder_decoder", False)

def load_model(self) -> None:
with measure_cuda_memory() as m:
@@ -162,7 +165,8 @@ def _prepare_prompt(
computed_block_nums = seq_group_metadata.computed_block_nums
if self.is_encoder_decoder:
# Encoder/decoder mode does not support prefix cache
assert computed_block_nums is None or len(computed_block_nums) == 0, \
assert computed_block_nums is None or \
len(computed_block_nums) == 0, \
"Encoder decoder models do not support Prefix Cache yet"
# Context length == 1 due to decoder_start token
context_lens.append(1)
@@ -267,7 +271,8 @@ def _prepare_prompt(
device=self.device)
if self.is_encoder_decoder:
padded_block_tables = []
# Pad the encoder block tables to the same length and then add a decoder block table in the end
# Pad the encoder block tables to the same length
# and then add a decoder block table in the end
for block_table in block_tables:
block_table = block_table[:-1] + [0] * (
max_block_table_len - len(block_table)) + block_table[-1:]
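
A toy illustration (assumed values) of the padding loop above: encoder blocks stay in place, zeros pad every row to the same width, and the single decoder block is kept at the end.

max_block_table_len = 5
block_table = [7, 8, 9, 42]  # encoder blocks [7, 8, 9] + decoder block [42], toy values
padded = (block_table[:-1] + [0] * (max_block_table_len - len(block_table)) +
          block_table[-1:])
# -> [7, 8, 9, 0, 42]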
@@ -353,11 +358,13 @@ def _prepare_decode(
prompt_lens.append(prompt_len)

if self.is_encoder_decoder:
# Encoder-decoder model stores prompt and generation tokens separately,
# Encoder-decoder model stores
# prompt and generation tokens separately,
# so we need to adjust to the pad.
prompt_blocks_num = (prompt_len + self.block_size -
1) // self.block_size
prompt_pad = prompt_blocks_num * self.block_size - prompt_len
prompt_pad = prompt_blocks_num * \
self.block_size - prompt_len
position += prompt_pad + 1 # One extra for decoder_start_id

if self.is_encoder_decoder:
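
And a worked example (toy numbers) of the position shift just above: encoder-decoder decode positions are advanced past the prompt padding plus the decoder_start token.

block_size = 16
prompt_len = 37
position = 2  # toy decode position before adjustment
prompt_blocks_num = (prompt_len + block_size - 1) // block_size  # -> 3
prompt_pad = prompt_blocks_num * block_size - prompt_len  # -> 11
position += prompt_pad + 1  # -> 14; one extra for decoder_start_id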
@@ -385,7 +392,8 @@ def _prepare_decode(
block_tables.append(block_table)
if self.is_encoder_decoder:
padded_block_tables = []
# Pad the encoder block tables to the same length and then add a decoder block table in the end
# Pad the encoder block tables to the same length
# and then add a decoder block table in the end
for block_table in block_tables:
block_table = block_table[:-1] + [0] * (
max_block_table_len - len(block_table)) + block_table[-1:]
