diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index cd16cecf21546..e8060e369a889 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,10 +25,13 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 + pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 - name: Analysing the code with ruff run: | ruff . - name: Spelling check with codespell run: | - codespell --toml pyproject.toml \ No newline at end of file + codespell --toml pyproject.toml + - name: Run isort + run: | + isort . --check-only diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 546c61e847839..1f3274a28cad5 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,8 +1,7 @@ import argparse import time -from vllm import LLM -from vllm import SamplingParams +from vllm import LLM, SamplingParams PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 9404608b5554b..976cd28b066ac 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,15 +25,12 @@ from typing import AsyncGenerator, List, Tuple import numpy as np +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer -from backend_request_func import ( - ASYNC_REQUEST_FUNCS, - RequestFuncInput, - RequestFuncOutput, -) +from vllm.transformers_utils.tokenizer import get_tokenizer @dataclass diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 72bdc4b3b4540..fae4776b2e093 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,9 +6,9 @@ from typing import List, Optional, Tuple import torch +from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from tqdm import tqdm def sample_requests( diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 964eca5aaf72b..8e976fbcb3028 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -2,11 +2,13 @@ import os import sys -from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name import torch import torch.nn.functional as F import triton +from vllm.model_executor.layers.fused_moe import (fused_moe, + get_config_file_name) + os.environ['CUDA_VISIBLE_DEVICES'] = '0' diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index d921dea1220e1..f6c8f900a3bff 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,12 +1,12 @@ -from typing import Optional import argparse import random import time +from typing import Optional import torch -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random from vllm._C import ops +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random NUM_BLOCKS = 1024 PARTITION_SIZE = 512 diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index f9564dd9588f0..9188e811e2982 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,9 +1,10 @@ +import argparse +from itertools import accumulate from typing import Optional -import argparse -import torch import nvtx -from itertools import accumulate +import torch + from vllm.model_executor.layers.rotary_embedding import get_rope diff --git a/cmake/hipify.py b/cmake/hipify.py index c4d8450630ba3..340e41c8179e3 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -9,8 +9,8 @@ # import argparse -import shutil import os +import shutil from torch.utils.hipify.hipify_python import hipify diff --git a/collect_env.py b/collect_env.py index edcbfe73b38d0..8982fba024274 100644 --- a/collect_env.py +++ b/collect_env.py @@ -6,10 +6,10 @@ # Run it with `python collect_env.py` or `python -m torch.utils.collect_env` import datetime import locale +import os import re import subprocess import sys -import os from collections import namedtuple try: diff --git a/docs/source/conf.py b/docs/source/conf.py index 2ca0d642b7463..61d8e55d2cc6c 
100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,10 +10,11 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. +import logging import os import sys + from sphinx.ext import autodoc -import logging sys.path.insert(0, os.path.abspath(os.path.join('..', '..'))) diff --git a/examples/gradio_openai_chatbot_webserver.py b/examples/gradio_openai_chatbot_webserver.py index 61e91d6b0c8b6..8ceb8f68ea0ce 100644 --- a/examples/gradio_openai_chatbot_webserver.py +++ b/examples/gradio_openai_chatbot_webserver.py @@ -1,6 +1,7 @@ import argparse -from openai import OpenAI + import gradio as gr +from openai import OpenAI # Argument parser setup parser = argparse.ArgumentParser( diff --git a/examples/llm_engine_example.py b/examples/llm_engine_example.py index cea15f8045d8a..a81c4b3e399c3 100644 --- a/examples/llm_engine_example.py +++ b/examples/llm_engine_example.py @@ -1,7 +1,7 @@ import argparse from typing import List, Tuple -from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams def create_test_prompts() -> List[Tuple[str, SamplingParams]]: diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index 9f28e16cf667a..6aa25b4689ec8 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -5,11 +5,11 @@ Requires HuggingFace credentials for access to Llama2. """ -from typing import Optional, List, Tuple +from typing import List, Optional, Tuple from huggingface_hub import snapshot_download -from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams from vllm.lora.request import LoRARequest diff --git a/examples/offline_inference_distributed.py b/examples/offline_inference_distributed.py index 0897045fd94ae..e4f085fa6665a 100644 --- a/examples/offline_inference_distributed.py +++ b/examples/offline_inference_distributed.py @@ -5,11 +5,13 @@ Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html """ -from vllm import LLM, SamplingParams from typing import Dict + import numpy as np import ray +from vllm import LLM, SamplingParams + # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) diff --git a/format.sh b/format.sh index ff30111123bee..deb57b2b049d1 100755 --- a/format.sh +++ b/format.sh @@ -25,6 +25,7 @@ YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) +ISORT_VERSION=$(isort --vn) # # params: tool name, tool version, required version tool_version_check() { @@ -37,6 +38,7 @@ tool_version_check() { tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" YAPF_FLAGS=( @@ -178,6 +180,46 @@ else lint_changed fi +# run isort on specified files +isort_check() { + isort "$@" +} + +isort_check_all(){ + isort .
+} + +# isort check of files that differ from main branch. +isort_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause isort to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort + fi +} + +# Run Isort +# This flag runs isort on individual files. --files *must* be the first command line +# arg to use this option. +if [[ "$1" == '--files' ]]; then + isort_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + isort_check_all +else + # Check import order only of the files that changed in last commit. + isort_check_changed +fi +echo 'vLLM isort: Done' + if ! git diff --quiet &>/dev/null; then echo 'Reformatted files. Please review and stage the changes.' echo 'Changes not staged for commit:' diff --git a/pyproject.toml b/pyproject.toml index b6d7649477dcc..4d6fb5a362fc7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,3 +51,7 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" [tool.codespell] ignore-words-list = "dout, te, indicies" skip = "./tests/prompts" + +[tool.isort] +use_parentheses = true +skip_gitignore = true diff --git a/requirements-dev.txt b/requirements-dev.txt index 51fa57f068003..72525d7c12280 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ toml==0.10.2 tomli==2.0.1 ruff==0.1.5 codespell==2.2.6 +isort==5.13.2 # type checking mypy==0.991 diff --git a/setup.py b/setup.py index 27106b1f45907..9c9a428f94683 100644 --- a/setup.py +++ b/setup.py @@ -1,16 +1,16 @@ import io +import logging import os import re -import logging import subprocess import sys +from shutil import which from typing import List -from packaging.version import parse, Version -from setuptools import setup, find_packages, Extension -from setuptools.command.build_ext import build_ext -from shutil import which import torch +from packaging.version import Version, parse +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext from torch.utils.cpp_extension import CUDA_HOME ROOT_DIR = os.path.dirname(__file__) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index e98bba8d43b49..6972ae1dee4a1 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -1,12 +1,12 @@ -from dataclasses import dataclass import os import pathlib +from dataclasses import dataclass import pytest -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.transformers_utils.tokenizer import get_tokenizer chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( __file__))).parent.parent / "examples/template_chatml.jinja" diff --git a/tests/conftest.py b/tests/conftest.py index c06b271e6c7f6..40a25ba012697 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,8 +6,8 @@ from transformers
import AutoModelForCausalLM from vllm import LLM, SamplingParams -from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.config import TokenizerPoolConfig +from vllm.transformers_utils.tokenizer import get_tokenizer _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 9473a33f0ee68..ee8e43890b333 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -1,13 +1,14 @@ -import pytest import time from typing import List +import pytest + from vllm import SamplingParams from vllm.block import PhysicalTokenBlock -from vllm.core.block_manager import (UncachedBlockAllocator, BlockSpaceManager, - AllocStatus) +from vllm.core.block_manager import (AllocStatus, BlockSpaceManager, + UncachedBlockAllocator) +from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob from .utils import create_dummy_prompt diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 4a690e24ec720..c66809c6642c3 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,10 +1,11 @@ +import time from typing import List + import pytest # noqa -import time from vllm.config import CacheConfig, SchedulerConfig from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup, Logprob +from vllm.sequence import Logprob, SequenceGroup from .utils import create_dummy_prompt diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 9474cb21599d4..1d376b18a66b3 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -3,14 +3,12 @@ Run `pytest tests/distributed/test_comm_ops.py --forked`. """ import pytest -import torch import ray +import torch from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_all_reduce, - tensor_model_parallel_all_gather, - broadcast_tensor_dict, -) + broadcast_tensor_dict, tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce) from vllm.test_utils import (init_test_distributed_environment, multi_process_tensor_parallel) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index ed4965593c2f0..1e6e7f89a528c 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,6 +1,6 @@ +import os import random -import os import pytest import ray import torch diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 4a0e3e759e25a..5622744566bcc 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -1,11 +1,11 @@ # This unit test should be moved to a new # tests/test_guided_decoding directory. 
-from transformers import AutoTokenizer import torch +from transformers import AutoTokenizer -from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor, - JSONLogitsProcessor) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) TEST_SCHEMA = { "type": "object", diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 86d9a85af80b1..0d1c32804fffa 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1,22 +1,21 @@ +# imports for guided decoding tests +import json import os +import re import subprocess +import sys import time -import sys +import jsonschema +import openai # use the official client for correctness check import pytest -import requests # using Ray for overall ease of process management, parallel requests, # and debugging. import ray -import openai # use the official client for correctness check +import requests # downloading lora to test lora requests from huggingface_hub import snapshot_download -# imports for guided decoding tests -import json -import jsonschema -import re - from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds diff --git a/tests/kernels/conftest.py b/tests/kernels/conftest.py index 8c51bfc149efe..d26da2c7fe4ee 100644 --- a/tests/kernels/conftest.py +++ b/tests/kernels/conftest.py @@ -1,4 +1,5 @@ import pytest + from vllm.utils import create_kv_caches_with_random diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index f78913f120aa4..86ecc6412c648 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -2,10 +2,10 @@ import pytest import torch +from allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) -from allclose_default import get_default_atol, get_default_rtol DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fb571de63d4e1..b03fecffdc645 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -3,13 +3,12 @@ import pytest import torch +from allclose_default import get_default_atol, get_default_rtol from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm._C import ops, cache_ops -from vllm.utils import get_max_shared_memory_bytes -from vllm.utils import is_hip -from allclose_default import get_default_atol, get_default_rtol +from vllm._C import cache_ops, ops +from vllm.utils import get_max_shared_memory_bytes, is_hip FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 9c707b41c81ab..0cdb92f2d9700 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,10 +1,9 @@ import random +from typing import Tuple import pytest import torch -from typing import Tuple - from vllm._C import cache_ops COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index f22bcf97365d2..affbbfb4aa94e 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,8 +7,8 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock -from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index ffdcc1e8c80fd..bf1856972cf33 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,9 +1,10 @@ +from itertools import accumulate from typing import List, Optional import pytest import torch from allclose_default import get_default_atol, get_default_rtol -from itertools import accumulate + from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 5a09095e76688..eb706c0dbb6a4 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -1,12 +1,13 @@ import random -import pytest import time +import pytest import torch -from vllm.attention.ops.prefix_prefill import context_attention_fwd from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask +from vllm.attention.ops.prefix_prefill import context_attention_fwd + NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128] diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index 3b9d0d732acf5..a4242d22eb489 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -1,7 +1,8 @@ -import torch -import pytest import random +import pytest +import torch + from vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 5f8c51fb074f4..e28f809309ec5 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -1,15 +1,15 @@ import gc -import torch import pytest +import torch import triton import triton.language as tl from vllm.model_executor.layers.ops.sample import ( - _uniform_to_exponential, sample, get_num_triton_sampler_splits, - MAX_TRITON_N_COLS) -from vllm.model_executor.utils import set_random_seed + MAX_TRITON_N_COLS, _uniform_to_exponential, get_num_triton_sampler_splits, + sample) from vllm.model_executor.sampling_metadata import SamplingTensors +from vllm.model_executor.utils import set_random_seed SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 38560c251696a..0705a51ca2cff 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -2,7 +2,7 @@ import gc import tempfile from collections import OrderedDict -from unittest.mock import 
patch, MagicMock +from unittest.mock import MagicMock, patch import pytest import ray @@ -12,13 +12,13 @@ import vllm from vllm.config import LoRAConfig -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.model_loader import get_model from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader import get_model from vllm.model_executor.parallel_utils.parallel_state import ( destroy_model_parallel, initialize_model_parallel) diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 95cf0cede8729..7d37aa6474adc 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -1,12 +1,14 @@ +import tempfile +from random import sample from typing import List, Optional + import peft import pytest -from random import sample -import tempfile from transformers import AutoModelForCausalLM import vllm from vllm.lora.request import LoRARequest + from .conftest import cleanup MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 7dfc3952016f5..caaece883ba21 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,32 +1,28 @@ -import pytest import random from copy import deepcopy from dataclasses import dataclass -from typing import List, Optional, Dict, Tuple +from typing import Dict, List, Optional, Tuple +import pytest import torch import torch.nn.functional as F -from vllm.lora.layers import ( - ColumnParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA, - QKVParallelLinearWithLora, - VocabParallelEmbeddingWithLoRA, - RowParallelLinearWithLoRA, - LogitsProcessorWithLoRA, - LoRAMapping, - BaseLayerWithLoRA, -) -from vllm.lora.models import (LoRALayerWeights, convert_mapping, - PackedLoRALayerWeights) from vllm.config import LoRAConfig -from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, + LogitsProcessorWithLoRA, LoRAMapping, + MergedColumnParallelLinearWithLoRA, + QKVParallelLinearWithLora, + RowParallelLinearWithLoRA, + VocabParallelEmbeddingWithLoRA) +from vllm.lora.models import (LoRALayerWeights, PackedLoRALayerWeights, + convert_mapping) from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, - RowParallelLinear, - QKVParallelLinear) + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) + ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 130906c3d584d..f5a571e81acba 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -3,6 +3,7 @@ import vllm from vllm.lora.request import LoRARequest + from .conftest import cleanup MODEL_PATH = "meta-llama/Llama-2-7b-hf" diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 2d4fc085b719b..c08eee9910149 100644 --- 
a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,11 +8,11 @@ from vllm.config import LoRAConfig from vllm.lora.layers import (ColumnParallelLinearWithLoRA, - RowParallelLinearWithLoRA, - MergedColumnParallelLinearWithLoRA) + MergedColumnParallelLinearWithLoRA, + RowParallelLinearWithLoRA) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.models import (LoRAModel, LoRAModelManager, - LRUCacheLoRAModelManager, LoRAMapping) +from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, + LRUCacheLoRAModelManager) from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 5fec3f179925a..2dcad23c2b547 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,8 +1,10 @@ import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase + from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from vllm.transformers_utils.tokenizer import get_lora_tokenizer +from vllm.transformers_utils.tokenizer_group import get_tokenizer_group + from ..conftest import get_tokenizer_pool_config diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 2996322f4aa48..892f6081e2aaa 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -2,8 +2,8 @@ from torch import nn +from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule from vllm.utils import LRUCache -from vllm.lora.utils import (parse_fine_tuned_lora_name, replace_submodule) def test_parse_fine_tuned_lora_name(): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 8e640ea2bac49..60aa90fe4ee8a 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,10 +3,10 @@ import tempfile from unittest.mock import patch +from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - DeviceConfig, LoRAConfig) from vllm.worker.worker import Worker diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index a3a1487e62e05..bddd6e4b50e04 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -11,9 +11,11 @@ Run `pytest tests/models/test_marlin.py --forked`. 
""" +from dataclasses import dataclass + import pytest import torch -from dataclasses import dataclass + from vllm.model_executor.layers.quantization import ( _QUANTIZATION_CONFIG_REGISTRY) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 14f1872c45258..41b7f3da1e839 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,7 +1,7 @@ import pytest import torch -from tests.conftest import VllmRunner +from tests.conftest import VllmRunner from vllm import SamplingParams MODELS = ["facebook/opt-125m"] diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 99ee78ce49824..d2c3a798d3087 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,13 +1,12 @@ """Tests for rejection sampling.""" -import pytest from typing import List, Tuple +import pytest import torch import torch.nn.functional as F -from vllm.model_executor.utils import set_random_seed - from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.model_executor.utils import set_random_seed CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 92aec831d02e2..3310f190e87a7 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,11 +1,10 @@ import random -from typing import Tuple, List +from typing import List, Optional, Tuple from unittest.mock import patch import pytest import torch from transformers import GenerationConfig, GenerationMixin -from typing import Optional from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.utils import set_random_seed diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index fcb0e09d46143..7dfc261c9830f 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -8,8 +8,8 @@ import pytest -from vllm.model_executor.utils import set_random_seed from vllm import SamplingParams +from vllm.model_executor.utils import set_random_seed MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index fddc3995452cc..80a960acf0be5 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,9 +1,9 @@ -import torch import pytest +import torch from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from .utils import mock_worker, create_seq_group_metadata_from_prompts +from .utils import create_seq_group_metadata_from_prompts, mock_worker @pytest.mark.parametrize('num_target_seq_ids', [100]) diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 09847136d13e9..36e91672069dc 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -1,9 +1,9 @@ -import torch import math -import pytest - from unittest.mock import MagicMock +import pytest +import torch + from vllm.spec_decode.metrics import AsyncMetricsCollector diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 5f788549d44d0..f4d44108b47c2 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,18 +1,19 @@ -import torch import random -import pytest from unittest.mock import MagicMock -from 
vllm.spec_decode.multi_step_worker import (MultiStepWorker, - DraftModelTop1Proposer) -from vllm.worker.worker import Worker +import pytest +import torch + from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplerOutput +from vllm.spec_decode.multi_step_worker import (DraftModelTop1Proposer, + MultiStepWorker) +from vllm.worker.worker import Worker -from .utils import (create_execute_model_data, create_worker, - create_seq_group_metadata_from_prompts, zero_kv_cache, - patch_execute_model_with_seeds, - assert_logprobs_dict_allclose, create_batch) +from .utils import (assert_logprobs_dict_allclose, create_batch, + create_execute_model_data, + create_seq_group_metadata_from_prompts, create_worker, + patch_execute_model_with_seeds, zero_kv_cache) @pytest.mark.parametrize('num_steps', list(range(1, 17))) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 39c3f18b20bb3..87d3716ca98d7 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,18 +1,20 @@ -import torch import random -import pytest from unittest.mock import MagicMock +import pytest +import torch + +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.model_executor.utils import set_random_seed +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.spec_decode.metrics import (AsyncMetricsCollector, + SpecDecodeWorkerMetrics) from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.model_executor.utils import set_random_seed -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from .utils import (mock_worker, create_batch, ExecuteModelData, - create_sampler_output_list) -from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, - AsyncMetricsCollector) + +from .utils import (ExecuteModelData, create_batch, create_sampler_output_list, + mock_worker) @pytest.mark.parametrize('k', [1, 2, 6]) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 19833ddb06154..6b6f35a1a1d05 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -1,9 +1,9 @@ -from vllm.spec_decode.util import get_all_seq_ids -from vllm.sequence import SequenceGroupMetadata -from vllm.spec_decode.util import split_batch_by_proposal_len +from unittest.mock import MagicMock import pytest -from unittest.mock import MagicMock + +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import get_all_seq_ids, split_batch_by_proposal_len def test_get_all_seq_ids(): diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index b7e9edbea88e2..e6756195694c1 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,17 +1,19 @@ -import torch -from typing import List, Optional, Dict, Iterable, Union +from dataclasses import dataclass, fields +from itertools import count +from typing import Dict, Iterable, List, Optional, Union from unittest.mock import MagicMock -from vllm.worker.worker import Worker -from vllm.utils import get_distributed_init_method, get_ip, get_open_port +import torch + from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, - SamplerOutput, SequenceGroupOutput, SequenceOutput) +from 
vllm.model_executor.utils import set_random_seed from vllm.sampling_params import SamplingParams +from vllm.sequence import (Logprob, SamplerOutput, SequenceData, + SequenceGroupMetadata, SequenceGroupOutput, + SequenceOutput) +from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.cache_engine import CacheEngine -from vllm.model_executor.utils import set_random_seed -from itertools import count -from dataclasses import dataclass, fields +from vllm.worker.worker import Worker @dataclass diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index a3ca3548a37a6..3b257ac062f56 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -7,8 +7,8 @@ import pytest from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.sequence import Sequence +from vllm.transformers_utils.tokenizer_group import TokenizerGroup # Make two prefixes with different first blocks. prefix_start = [("You are an expert"), ("You are a")] diff --git a/tests/test_sequence.py b/tests/test_sequence.py index e18df059d770f..bb6bcddf1343e 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,6 +1,6 @@ import pytest -from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput +from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput @pytest.fixture diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index 181e800325128..4c8238fd8d113 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,7 +1,9 @@ from copy import deepcopy -from vllm.transformers_utils.tokenizer import get_cached_tokenizer + from transformers import AutoTokenizer +from vllm.transformers_utils.tokenizer import get_cached_tokenizer + def test_cached_tokenizer(): reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 082034083aebd..5949858a354dc 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,12 +1,12 @@ -import pytest +from typing import Dict, List +import pytest from transformers import AutoTokenizer -from typing import List, Dict -from vllm.sequence import Sequence, Logprob, SamplingParams, SequenceGroup -from vllm.transformers_utils.tokenizer_group import get_tokenizer_group -from vllm.transformers_utils.tokenizer import detokenize_incrementally +from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import Detokenizer +from vllm.transformers_utils.tokenizer import detokenize_incrementally +from vllm.transformers_utils.tokenizer_group import get_tokenizer_group TRUTH = [ "Hello here, this is a simple test", diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index d0788ee87563d..31571dbfff6f6 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,14 +1,16 @@ -import os -import pytest import asyncio +import os from unittest.mock import patch +import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase + from vllm.transformers_utils.tokenizer_group import get_tokenizer_group from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( RayTokenizerGroupPool) from 
vllm.transformers_utils.tokenizer_group.tokenizer_group import ( TokenizerGroup) + from ..conftest import get_tokenizer_pool_config diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 5548b2c795222..bc86fb574e024 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,8 +1,8 @@ import torch from vllm.engine.arg_utils import EngineArgs -from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.worker.worker import Worker def test_swap() -> None: diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index e8b9b95dc4234..9acb82c0df2c2 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -1,4 +1,5 @@ -from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadata) from vllm.attention.layer import Attention from vllm.attention.selector import get_attn_backend diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ac33a917bb0ad..e50d52377b8e0 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -7,12 +7,13 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Type -from flash_attn import flash_attn_varlen_func import torch +from flash_attn import flash_attn_varlen_func from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) -from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) class FlashAttentionBackend(AttentionBackend): diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index b7eff2b598e1a..fcd903ddf5f51 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -11,7 +11,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) -from vllm.attention.ops.paged_attn import PagedAttention, PagedAttentionMetadata +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) from vllm.logger import init_logger from vllm.utils import is_hip diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index b20711eb95e59..5901af4f0a02f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -3,8 +3,7 @@ import torch -from vllm._C import cache_ops -from vllm._C import ops +from vllm._C import cache_ops, ops from vllm.attention.ops.prefix_prefill import context_attention_fwd # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 42b05ee320314..90fce1a0349b2 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -13,11 +13,13 @@ def get_attn_backend(dtype: torch.dtype) -> AttentionBackend: if _can_use_flash_attn(dtype): logger.info("Using FlashAttention backend.") - from vllm.attention.backends.flash_attn import FlashAttentionBackend # noqa: F401 + from vllm.attention.backends.flash_attn import ( # noqa: F401 + FlashAttentionBackend) return FlashAttentionBackend else: logger.info("Using XFormers backend.") - from vllm.attention.backends.xformers import XFormersBackend # noqa: F401 + from vllm.attention.backends.xformers import ( # noqa: F401 + XFormersBackend) return XFormersBackend diff --git a/vllm/config.py b/vllm/config.py index 2003563e4e50e..6070d9d9e50f1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,15 +1,15 @@ -from typing import TYPE_CHECKING, Optional, Union, ClassVar -from dataclasses import dataclass +import json import os -from packaging.version import Version +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Optional, Union -import json import torch +from packaging.version import Version from transformers import PretrainedConfig from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version +from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -103,7 +103,8 @@ def __init__( if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. - from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + # pylint: disable=C. 
+ from modelscope.hub.snapshot_download import snapshot_download if not os.path.exists(model): model_path = snapshot_download(model_id=model, diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c26514b8c0936..c6fca4134e9a6 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,15 +1,15 @@ """A block manager that manages token blocks.""" import enum +from abc import ABC, abstractmethod from itertools import count, takewhile from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple -from abc import ABC, abstractmethod from vllm.block import BlockTable, PhysicalTokenBlock +from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor +from vllm.logger import init_logger from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor -from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 92515468a8a1f..aa51dd6938872 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,6 +1,6 @@ import enum -from typing import OrderedDict from abc import ABC, abstractmethod, abstractproperty +from typing import OrderedDict from vllm.block import PhysicalTokenBlock diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 4bd0ef360b3ff..72b4cf043e908 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1,13 +1,13 @@ -from collections import deque import enum import time -from typing import Deque, Dict, Iterable, List, Optional, Tuple, Union, Set +from collections import deque +from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.block_manager import AllocStatus, BlockSpaceManager from vllm.core.policy import PolicyFactory -from vllm.lora.request import LoRARequest from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceStatus) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a47edaf05a356..fc6665dbe64bc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,9 +3,8 @@ from dataclasses import dataclass from typing import Optional, Tuple -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, SchedulerConfig, LoRAConfig, - TokenizerPoolConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, TokenizerPoolConfig) @dataclass diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1a463ab1baae7..d642915aee192 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,17 +2,17 @@ import os import time from functools import partial -from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) +from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional, + Set, Tuple, Type, Union) from transformers import PreTrainedTokenizer -from vllm.lora.request import LoRARequest from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.engine.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.outputs import 
RequestOutput from vllm.sampling_params import SamplingParams diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 283b5d9ac44c1..f9638d1101906 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -4,22 +4,22 @@ from transformers import PreTrainedTokenizer import vllm -from vllm.lora.request import LoRARequest -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs -from vllm.executor.executor_base import ExecutorBase from vllm.engine.metrics import StatLogger, Stats from vllm.engine.ray_utils import initialize_ray_cluster +from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) +from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, get_tokenizer_group) -from vllm.transformers_utils.detokenizer import Detokenizer from vllm.utils import Counter logger = init_logger(__name__) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 17b1852f5b0a3..905db52a1912b 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,11 +1,12 @@ -from vllm.logger import init_logger -from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, - disable_created_metrics) - import time -import numpy as np -from typing import Dict, List from dataclasses import dataclass +from typing import Dict, List + +import numpy as np +from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info, + disable_created_metrics) + +from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 27414f085b45a..70d5c9b1fae05 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,10 +1,9 @@ import pickle - -from typing import Optional, List, Tuple +from typing import List, Optional, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import is_hip, set_cuda_visible_devices, get_ip +from vllm.utils import get_ip, is_hip, set_cuda_visible_devices logger = init_logger(__name__) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index ba93b1beb2aa4..378136e81cbe5 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -11,9 +11,9 @@ import ssl from typing import AsyncGenerator +import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse -import uvicorn from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e9b3d46d4bb61..db223d809ea02 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -3,9 +3,9 @@ from tqdm import tqdm from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -from vllm.lora.request import LoRARequest from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine +from 
vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.utils import Counter diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a0685a4d38fbe..06e8bdf11abd3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,28 +1,27 @@ import asyncio -from contextlib import asynccontextmanager -import os import importlib import inspect +import os +from contextlib import asynccontextmanager +from http import HTTPStatus -from prometheus_client import make_asgi_app import fastapi import uvicorn -from http import HTTPStatus from fastapi import Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse, StreamingResponse, Response +from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import make_asgi_app import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import (CompletionRequest, - ChatCompletionRequest, - ErrorResponse) -from vllm.logger import init_logger from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + CompletionRequest, ErrorResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.logger import init_logger TIMEOUT_KEEP_ALIVE = 5 # seconds diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1f089d524fd03..f1fae1f825f97 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,12 +3,11 @@ import time from typing import Dict, List, Literal, Optional, Union +import torch from pydantic import BaseModel, Field, model_validator -from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams - -import torch +from vllm.utils import random_uuid class ErrorResponse(BaseModel): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index bfdfe39f210ed..0de80f04e51f3 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,19 +1,21 @@ -import time import codecs +import time +from typing import AsyncGenerator, AsyncIterator, List, Optional, Union + from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Optional, List, Union -from vllm.logger import init_logger -from vllm.utils import random_uuid + from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, UsageInfo) -from vllm.outputs import RequestOutput -from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing +from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) +from vllm.outputs import RequestOutput +from vllm.utils import random_uuid logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py 
index 5f2be878a7b76..33c797341114a 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -1,24 +1,23 @@
 import asyncio
 import time
+from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List,
+                    Optional, Tuple)
+
 from fastapi import Request
-from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional,
-                    Dict, Tuple)
-from vllm.logger import init_logger
-from vllm.utils import random_uuid
+
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (
-    CompletionRequest,
-    CompletionResponse,
-    CompletionResponseChoice,
-    CompletionResponseStreamChoice,
-    CompletionStreamResponse,
-    LogProbs,
-    UsageInfo,
-)
-from vllm.outputs import RequestOutput
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA
+from vllm.entrypoints.openai.protocol import (CompletionRequest,
+                                              CompletionResponse,
+                                              CompletionResponseChoice,
+                                              CompletionResponseStreamChoice,
+                                              CompletionStreamResponse,
+                                              LogProbs, UsageInfo)
+from vllm.entrypoints.openai.serving_engine import LoRA, OpenAIServing
+from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
+from vllm.outputs import RequestOutput
+from vllm.utils import random_uuid
 logger = init_logger(__name__)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 976046beec245..47e0275c9d465 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -3,16 +3,16 @@
 from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Dict, List, Optional, Union
-from vllm.logger import init_logger
-from vllm.transformers_utils.tokenizer import get_tokenizer
+
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.entrypoints.openai.protocol import (CompletionRequest,
-                                              ChatCompletionRequest,
-                                              ErrorResponse, LogProbs,
-                                              ModelCard, ModelList,
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest, ErrorResponse,
+                                              LogProbs, ModelCard, ModelList,
                                               ModelPermission)
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import Logprob
+from vllm.transformers_utils.tokenizer import get_tokenizer
 logger = init_logger(__name__)
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 30717e8a87358..cc6c12edcffe2 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,8 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index eb2ee262b6733..a48f0ac7f0e53 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -1,13 +1,13 @@
 from typing import Dict, List, Optional
-from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.utils import check_block_size_valid
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import (get_ip, get_open_port, get_distributed_init_method,
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                         make_async)
 logger = init_logger(__name__)
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index c0ade4767156c..9eae7a7df1367 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,10 +1,10 @@
 from typing import Dict, List, Optional
-from vllm.lora.request import LoRARequest
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 logger = init_logger(__name__)
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 1faf5b7d68faf..e39b881f7caba 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -1,20 +1,20 @@
 import asyncio
 import copy
-from collections import defaultdict
 import os
 import pickle
+from collections import defaultdict
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
 from vllm.engine.ray_utils import RayWorkerVllm, ray
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.utils import check_block_size_valid
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port,
-                        get_distributed_init_method, make_async)
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async, set_cuda_visible_devices)
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -343,7 +343,7 @@ def _compiled_ray_dag(self):
             raise ValueError(f"Ray version {required_version} or greater is "
                              f"required, but found {current_version}")
-        from ray.dag import MultiOutputNode, InputNode
+        from ray.dag import InputNode, MultiOutputNode
         assert self.parallel_config.worker_use_ray
         # Right now, compiled DAG requires at least 1 arg. We send
diff --git a/vllm/logger.py b/vllm/logger.py
index d25fcef9ba2ee..e5e46f5cce3fe 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -2,8 +2,8 @@
 # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
 """Logging configuration for vLLM."""
 import logging
-import sys
 import os
+import sys
 VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 9975df37b320b..920523e58ccfc 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -10,18 +10,16 @@
 from vllm.config import LoRAConfig
 from vllm.lora.punica import add_lora, add_lora_slice, bgmv
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_gather,
-    tensor_model_parallel_all_reduce,
-    tensor_model_parallel_gather,
-)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               RowParallelLinear,
+                                               MergedColumnParallelLinear,
                                                QKVParallelLinear,
-                                               MergedColumnParallelLinear)
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce,
+    tensor_model_parallel_gather)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.parallel_utils.utils import (
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index f4b3762a53f13..21c2196eb2739 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -1,6 +1,7 @@
 from typing import List, Optional
 import torch
+
 from vllm.utils import is_pin_memory_available
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index a96b49c236eda..97ee1a78b20b7 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -4,19 +4,18 @@
 import math
 import os
 import re
-from typing import (Callable, Dict, Hashable, List, Optional, Tuple, Type)
+from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type
 import safetensors.torch
 import torch
 from torch import nn
 from vllm.config import LoRAConfig
-from vllm.utils import LRUCache, is_pin_memory_available
-
 from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer,
                               from_layer_logits_processor)
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
+from vllm.utils import LRUCache, is_pin_memory_available
 logger = logging.getLogger(__name__)
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 911115d63a639..840f1b513035e 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -4,11 +4,11 @@
 import torch
+from vllm.config import LoRAConfig
+from vllm.lora.layers import LoRAMapping
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
 from vllm.lora.request import LoRARequest
-from vllm.lora.layers import LoRAMapping
-from vllm.config import LoRAConfig
 logger = logging.getLogger(__name__)
diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py
index bd09cf9cb6ee3..e56f74c7794fb 100644
--- a/vllm/model_executor/guided_decoding.py
+++ b/vllm/model_executor/guided_decoding.py
@@ -5,16 +5,16 @@
 from functools import lru_cache
 from json import dumps as json_dumps
 from re import escape as regex_escape
-from typing import Union, Tuple
+from typing import Tuple, Union
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
-from vllm.entrypoints.openai.protocol import (CompletionRequest,
-                                              ChatCompletionRequest)
-from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
-                                                          RegexLogitsProcessor,
-                                                          CFGLogitsProcessor)
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              CompletionRequest)
+from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
+                                                          JSONLogitsProcessor,
+                                                          RegexLogitsProcessor)
 class GuidedDecodingMode(Enum):
diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py
index 2cd1ae1571065..035fe00037328 100644
--- a/vllm/model_executor/guided_logits_processors.py
+++ b/vllm/model_executor/guided_logits_processors.py
@@ -16,13 +16,13 @@
 import json
 import math
 from collections import defaultdict
-from typing import Union, DefaultDict, Dict, List, Optional, Callable
+from typing import Callable, DefaultDict, Dict, List, Optional, Union
 import torch
+from outlines.fsm.fsm import CFGFSM, RegexFSM
+from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
-from outlines.fsm.fsm import RegexFSM, CFGFSM
-from outlines.fsm.json_schema import build_regex_from_schema
 class BaseLogitsProcessor:
diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py
index 299ab44f8f3d5..496d69c89c62b 100644
--- a/vllm/model_executor/layers/fused_moe/__init__.py
+++ b/vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,7 +1,5 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
-    fused_moe,
-    get_config_file_name,
-)
+    fused_moe, get_config_file_name)
 __all__ = [
     "fused_moe",
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 40e681df48f86..f3d4d1789db2d 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -5,14 +5,14 @@
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
+from vllm.logger import init_logger
+from vllm.model_executor.parallel_utils.communication_op import (
+    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
-from vllm.model_executor.parallel_utils.communication_op import (
-    tensor_model_parallel_all_reduce, tensor_model_parallel_all_gather)
 from vllm.model_executor.parallel_utils.utils import (
     divide, split_tensor_along_last_dim)
 from vllm.model_executor.utils import set_weight_attrs
-from vllm.logger import init_logger
 logger = init_logger(__name__)
diff --git a/vllm/model_executor/layers/ops/rand.py b/vllm/model_executor/layers/ops/rand.py
index 5b4b7a153351f..4a429e329567d 100644
--- a/vllm/model_executor/layers/ops/rand.py
+++ b/vllm/model_executor/layers/ops/rand.py
@@ -1,9 +1,9 @@
+from typing import Optional, Union
+
 import torch
 import triton
 import triton.language as tl
-from typing import Optional, Union
-
 def seeded_uniform(
     *size,
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index 0077317282204..a19e9461f41f7 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -1,5 +1,5 @@
 import math
-from typing import Tuple, Optional
+from typing import Optional, Tuple
 import torch
 import triton
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index af27b1844cea4..ad988d48755b0 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,11 +1,11 @@
 from typing import Type
+from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
-from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
+from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 _QUANTIZATION_CONFIG_REGISTRY = {
     "awq": AWQConfig,
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index bb69c7235a133..53baf710ed811 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -1,7 +1,7 @@
 import enum
 from enum import Enum
-from typing import Any, Dict, List, Optional
 from fractions import Fraction
+from typing import Any, Dict, List, Optional
 import torch
 from torch.nn.parameter import Parameter
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index 48e44445a4a20..784229878edf4 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -4,7 +4,8 @@
 from torch.nn.parameter import Parameter
 from vllm._C import ops
-from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 5643454060251..ecd2bd0fce3a3 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -1,9 +1,9 @@
-from typing import Tuple, Optional
 from functools import cached_property
+from typing import Optional, Tuple
 import torch
-import torch.nn as nn
 import torch.jit
+import torch.nn as nn
 class RejectionSampler(nn.Module):
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 6d13cf818cbfe..73bbfac33ed13 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -4,13 +4,11 @@
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from vllm.model_executor.parallel_utils.utils import divide
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.utils import divide
 from vllm.model_executor.utils import set_weight_attrs
 DEFAULT_VOCAB_PADDING_SIZE = 64
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 2d5fcf7b9c54f..4ecafa726321d 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -32,11 +32,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 88a1c81008558..15905e2250832 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -15,11 +15,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index c66f72db21e9e..2a2182ff4ebad 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -33,14 +33,14 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                                MergedColumnParallelLinear,
-                                               ReplicatedLinear,
                                                QKVParallelLinear,
+                                               ReplicatedLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 543e87101f6ea..77c19b227d213 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -32,8 +32,8 @@
                                                LinearMethodBase,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 49a08a62b54ac..a5432a035db6d 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -28,8 +28,8 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index ae5d480cf4bc4..94048efe48420 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -28,11 +28,11 @@
                                                LinearMethodBase,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index e08adf06bf115..673900487cc96 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -28,11 +28,11 @@
                                                LinearMethodBase,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 03b3271daa508..bdb48bf21042e 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -12,11 +12,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index e3f3dce375046..12fc9dbd50732 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -24,29 +24,23 @@
 import torch
 from torch import nn
-from vllm.transformers_utils.configs import JAISConfig
 from vllm.attention import Attention, AttentionMetadata
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    LinearMethodBase,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               LinearMethodBase,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, )
+    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size,
-    get_tensor_model_parallel_rank,
-)
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
-from vllm.sequence import SamplerOutput
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
+from vllm.sequence import SamplerOutput
+from vllm.transformers_utils.configs import JAISConfig
 class SwiGLUActivation(nn.Module):
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 4d53548d5304d..2cd56f0ce59d8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -35,11 +35,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index f4dae20f9a228..429bc8109b9f8 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -35,11 +35,11 @@
                                                QKVParallelLinear,
                                                ReplicatedLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 15068efb3b0b7..75f86bc134ee3 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -24,24 +24,22 @@
 from typing import List, Optional
 import numpy as np
-
 import torch
 import torch.nn.functional as F
-
 from torch import nn
 from transformers import MixtralConfig
 from vllm.attention import Attention, AttentionMetadata
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
-                                               ReplicatedLinear,
                                                QKVParallelLinear,
+                                               ReplicatedLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.communication_op import (
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 237f870dfe4a6..459f11d1d35a7 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -40,32 +40,27 @@
 import torch
 import torch.nn.functional as F
+# this model must need this dependency
+from hf_olmo import OLMoConfig
 from torch import nn
 from vllm.attention import Attention, AttentionMetadata
-from vllm.model_executor.layers.linear import (
-    ColumnParallelLinear,
-    LinearMethodBase,
-    QKVParallelLinear,
-    RowParallelLinear,
-)
-from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+                                               LinearMethodBase,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size, )
+    get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (
-    default_weight_loader,
-    hf_model_weights_iterator,
-)
+from vllm.model_executor.weight_utils import (default_weight_loader,
+                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-# this model must need this dependency
-from hf_olmo import OLMoConfig
-
 class SwiGLU(nn.Module):
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index ea8119df664cc..ee910563b20df 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -16,11 +16,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 1737e5efb6cb3..40e068acaba7d 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -47,11 +47,11 @@
                                                LinearMethodBase,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index bd7976dfc1d48..a63b9c8d63d13 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -17,11 +17,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index fe34fe113866d..8c92cd773f6b9 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -29,24 +29,24 @@
 from transformers import Qwen2Config
 from vllm.attention import Attention, AttentionMetadata
+from vllm.config import LoRAConfig
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
-from vllm.config import LoRAConfig
 class Qwen2MLP(nn.Module):
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 7d64bcdf3f3ba..b83637fd50dc7 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -31,11 +31,11 @@
                                                MergedColumnParallelLinear,
                                                QKVParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead)
+    ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 82e2cfa961db2..50d23e0a3b6ef 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -25,19 +25,19 @@
 from transformers import Starcoder2Config
 from vllm.attention import Attention, AttentionMetadata
-from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearMethodBase,
                                                QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE)
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                               hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py
index 5ad9040478398..43d17ad373b87 100644
--- a/vllm/model_executor/neuron_model_loader.py
+++ b/vllm/model_executor/neuron_model_loader.py
@@ -110,8 +110,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
 def get_neuron_model(model_config: ModelConfig,
                      parallel_config: ParallelConfig,
                      scheduler_config: SchedulerConfig) -> nn.Module:
-    from transformers_neuronx.config import (NeuronConfig,
-                                             ContinuousBatchingConfig)
+    from transformers_neuronx.config import (ContinuousBatchingConfig,
+                                             NeuronConfig)
     # Create a model instance.
     model = NeuronCasualLM(model_config.hf_config)
diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py
index 6f00fd001d956..04b30b4d093d7 100644
--- a/vllm/model_executor/parallel_utils/communication_op.py
+++ b/vllm/model_executor/parallel_utils/communication_op.py
@@ -5,14 +5,11 @@
 from torch.distributed import ProcessGroup
 from vllm.model_executor.parallel_utils import cupy_utils
-from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-    get_tensor_model_parallel_group,
-    is_cupy_nccl_enabled_for_all_reduce,
-)
 from vllm.model_executor.parallel_utils.custom_all_reduce import (
     custom_all_reduce)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_group, get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size, is_cupy_nccl_enabled_for_all_reduce)
 def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py
index 396be89492367..bf8ee07070c8a 100644
--- a/vllm/model_executor/parallel_utils/custom_all_reduce.py
+++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py
@@ -6,11 +6,12 @@
 from vllm.logger import init_logger
 from vllm.model_executor.parallel_utils.parallel_state import (
-    get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank)
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
 try:
-    from vllm._C import custom_ar
     import pynvml
+
+    from vllm._C import custom_ar
 except ImportError:
     # For AMD GPUs
     custom_ar = None
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 0ea850791cf4b..534cb75c2fd2f 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -1,11 +1,10 @@
+import random
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 import torch
-import random
-from vllm.model_executor.layers.ops.sample import (
-    get_num_triton_sampler_splits)
+from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SequenceData
 from vllm.utils import is_pin_memory_available
diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py
index 7eebe151754ba..9181f298871db 100644
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/weight_utils.py
@@ -1,23 +1,23 @@
 """Utilities for downloading and initializing model weights."""
-import filelock
+import fnmatch
 import glob
 import hashlib
-import fnmatch
 import json
 import os
 from collections import defaultdict
 from typing import Any, Iterator, List, Optional, Tuple
-from huggingface_hub import snapshot_download, HfFileSystem
+import filelock
 import numpy as np
-from safetensors.torch import load_file, save_file, safe_open
 import torch
+from huggingface_hub import HfFileSystem, snapshot_download
+from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.model_executor.layers.quantization import (get_quantization_config,
-                                                     QuantizationConfig)
+from vllm.model_executor.layers.quantization import (QuantizationConfig,
+                                                     get_quantization_config)
 logger = init_logger(__name__)
diff --git a/vllm/outputs.py b/vllm/outputs.py
index b8173fd7a0638..accc18ad41aa8 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -1,9 +1,9 @@
-from typing import List, Optional
 import time
+from typing import List, Optional
-from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
-                           SequenceStatus, RequestMetrics)
 from vllm.lora.request import LoRARequest
+from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
+                           SequenceGroup, SequenceStatus)
 class CompletionOutput:
diff --git a/vllm/sequence.py b/vllm/sequence.py
index af18eed959b1e..72f16579c83c6 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -2,14 +2,15 @@
 import copy
 import enum
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Union, TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 from vllm.block import LogicalTokenBlock
-from vllm.sampling_params import SamplingParams
 from vllm.lora.request import LoRARequest
+from vllm.sampling_params import SamplingParams
 if TYPE_CHECKING:
     import torch
+
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index 0f698fa346010..e0b75837e8a39 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -1,16 +1,15 @@
-from typing import Iterator, List, Tuple, Optional, Dict
 from itertools import chain, count
+from typing import Dict, Iterator, List, Optional, Tuple
 import torch
-from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData)
-from vllm.worker.worker import Worker
-from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch,
-                                   get_all_seq_ids,
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.spec_decode.interfaces import (SpeculativeProposals,
+                                         SpeculativeScorer, SpeculativeScores)
+from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
+                                   sampler_output_to_torch,
                                    split_batch_by_proposal_len)
-from vllm.spec_decode.interfaces import (SpeculativeScorer,
-                                         SpeculativeProposals,
-                                         SpeculativeScores)
+from vllm.worker.worker import Worker
 SeqId = int
 TargetSeqId = int
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index 9e53ffb60ac32..2a72974d01bdc 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,6 +1,6 @@
-from typing import List, Tuple, Optional, Dict
-from dataclasses import dataclass
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
 import torch
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 1d9b00b3e4d38..5df8fc4316d48 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -1,10 +1,11 @@
-import torch
+import time
 from dataclasses import dataclass
+from typing import Callable, Optional
+
+import torch
+
 from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from typing import Optional
 from vllm.utils import is_pin_memory_available
-import time
-from typing import Callable
 @dataclass
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 0d9a6f9187cbc..73b6e201c67a9 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,13 +1,13 @@
-from typing import List, Dict, Optional, Tuple
 import copy
+from typing import Dict, List, Optional, Tuple
 import torch
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.worker.worker import Worker
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
 from vllm.spec_decode.util import sampler_output_to_torch
+from vllm.worker.worker import Worker
 class MultiStepWorker(Worker):
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 87837ad1aa71b..59f9d5b5107f3 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,20 +1,20 @@
-from typing import List, Tuple, Optional, Dict
 from functools import cached_property
+from typing import Dict, List, Optional, Tuple
 import torch
-from vllm.spec_decode.metrics import AsyncMetricsCollector
+from vllm.config import CacheConfig
+from vllm.model_executor.layers.rejection_sampler import RejectionSampler
 from vllm.sequence import (SamplerOutput, SequenceGroupMetadata,
                            SequenceGroupOutput, SequenceOutput)
-from vllm.worker.worker import Worker
+from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
+from vllm.spec_decode.interfaces import (SpeculativeProposals,
+                                         SpeculativeScorer, SpeculativeScores)
+from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
-from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.config import CacheConfig
-from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids,
+from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
                                    split_batch_by_proposal_len)
-from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores
-from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
-from vllm.spec_decode.interfaces import SpeculativeScorer
+from vllm.worker.worker import Worker
 class SpecDecodeWorker:
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 2c5f954551905..406568a4bc08c 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -1,8 +1,10 @@
-import torch
-from typing import List, Tuple
-from vllm.sequence import SequenceGroupMetadata, SamplerOutput
 from contextlib import contextmanager
 from itertools import chain
+from typing import List, Tuple
+
+import torch
+
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 SeqId = int
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 6fed2fab8c438..22220852c0b20 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -1,10 +1,10 @@
 from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
-from vllm.transformers_utils.configs.mpt import MPTConfig
 # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
 from vllm.transformers_utils.configs.jais import JAISConfig
+from vllm.transformers_utils.configs.mpt import MPTConfig
 __all__ = [
     "ChatGLMConfig",
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py
index 2c0e45623aa25..497db0ae48c96 100644
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@@ -4,6 +4,7 @@
 """A HuggingFace-style model configuration."""
 import warnings
 from typing import Any, Dict, Optional, Union
+
 from transformers import PretrainedConfig
 attn_config_defaults: Dict = {
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 1f322b3675d02..419687e23b718 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -1,8 +1,10 @@
-from typing import List, Dict, Optional
+from typing import Dict, List, Optional
+
 from transformers import PreTrainedTokenizer
-from vllm.sequence import Sequence, Logprob, SequenceGroup, SamplingParams
-from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
-                                               convert_prompt_ids_to_tokens)
+
+from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
+from vllm.transformers_utils.tokenizer import (convert_prompt_ids_to_tokens,
+                                               detokenize_incrementally)
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index eebdacc4903ca..ad778d192f6a0 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -5,8 +5,8 @@
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
-from vllm.utils import make_async
 from vllm.transformers_utils.tokenizers import *
+from vllm.utils import make_async
 logger = init_logger(__name__)
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index adc8d9b90ddb6..a3b979e8fbc13 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,10 +1,11 @@
 from typing import Optional
+
 from vllm.config import TokenizerPoolConfig
+from vllm.engine.ray_utils import ray
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
 from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
     TokenizerGroup)
-from vllm.engine.ray_utils import ray
 if ray:
     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index e048ec05bece7..8ea46f7db1681 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -2,16 +2,16 @@
 import os
 from typing import List, Optional
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
 from transformers import PreTrainedTokenizer
 from vllm.config import TokenizerPoolConfig
-from vllm.lora.request import LoRARequest
 from vllm.engine.ray_utils import ray
+from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
 from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
     TokenizerGroup)
-from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
 class RayTokenizerGroupPool(BaseTokenizerGroup):
diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
index ec20d0fb713a4..927cbeed073bf 100644
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
@@ -4,11 +4,11 @@
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import (get_lora_tokenizer,
-                                               get_lora_tokenizer_async)
+                                               get_lora_tokenizer_async,
+                                               get_tokenizer)
 from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
     BaseTokenizerGroup)
 from vllm.utils import LRUCache
-from vllm.transformers_utils.tokenizer import get_tokenizer
 class TokenizerGroup(BaseTokenizerGroup):
diff --git a/vllm/utils.py b/vllm/utils.py
index 13b3621a89638..4b9558ffe88d8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,27 +1,22 @@
+import asyncio
 import enum
+import gc
 import os
 import socket
 import subprocess
 import uuid
-import gc
+import warnings
+from collections import OrderedDict
+from functools import lru_cache, partial
 from platform import uname
-from typing import List, Tuple, Union, Generic
-from packaging.version import parse, Version
+from typing import (Any, Awaitable, Callable, Generic, Hashable, List,
+                    Optional, Tuple, TypeVar, Union)
 import psutil
 import torch
-import asyncio
-from functools import partial, lru_cache
-from typing import (
-    Awaitable,
-    Callable,
-    TypeVar,
-)
-from collections import OrderedDict
-from typing import Any, Hashable, Optional
+from packaging.version import Version, parse
 from vllm.logger import init_logger
-import warnings
 T = TypeVar("T")
 logger = init_logger(__name__)
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index b403e28d8934d..27d1727cd16a3 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -6,7 +6,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available, STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available
 logger = init_logger(__name__)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 6e1fb4ede815c..fd96e752bb15d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,29 +1,28 @@
 import contextlib
 import time
-from typing import Dict, List, Optional, Tuple, Set
+from typing import Dict, List, Optional, Set, Tuple
 import numpy as np
 import torch
 import torch.nn as nn
 from vllm.attention import AttentionMetadata, get_attn_backend
-from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig,
+from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.logger import init_logger
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
-from vllm.model_executor.parallel_utils import cupy_utils
+from vllm.model_executor.parallel_utils import cupy_utils, custom_all_reduce
 from vllm.model_executor.parallel_utils.communication_op import (
     broadcast_tensor_dict)
 from vllm.model_executor.parallel_utils.parallel_state import (
     with_cupy_nccl_for_all_reduce)
-from vllm.model_executor.parallel_utils import custom_all_reduce
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
-from vllm.lora.layers import LoRAMapping
-from vllm.lora.request import LoRARequest
-from vllm.utils import (async_tensor_h2d, CudaMemoryProfiler,
+from vllm.utils import (CudaMemoryProfiler, async_tensor_h2d,
                         is_pin_memory_available, make_tensor_with_pad,
                         maybe_expand_dim)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 2f9398a701b45..48c276681e9d4 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,13 +1,14 @@
 """A GPU worker class."""
 import gc
 import os
-from typing import Dict, List, Tuple, Set, Optional
+from typing import Dict, List, Optional, Set, Tuple
 import torch
 import torch.distributed
-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig)
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.communication_op import (
@@ -18,7 +19,6 @@
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.lora.request import LoRARequest
 class Worker: