diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml
new file mode 100644
index 0000000000000..7ef052a525963
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/100-documentation.yml
@@ -0,0 +1,22 @@
+name: 📚 Documentation
+description: Report an issue related to https://docs.vllm.ai/
+title: "[Doc]: "
+labels: ["doc"]
+
+body:
+- type: textarea
+  attributes:
+    label: 📚 The doc issue
+    description: >
+      A clear and concise description of which content in https://docs.vllm.ai/ is problematic.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Suggest a potential alternative/fix
+    description: >
+      Tell us how we could improve the documentation in this regard.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
new file mode 100644
index 0000000000000..4c6c96187cc6c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -0,0 +1,39 @@
+name: 🛠️ Installation
+description: Report an issue here when you hit errors during installation.
+title: "[Installation]: "
+labels: ["installation"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: How you are installing vllm
+    description: |
+      Paste the full command you are trying to execute.
+    value: |
+      ```sh
+      pip install -vvv vllm
+      ```
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
new file mode 100644
index 0000000000000..88227b4b2e7b9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -0,0 +1,37 @@
+name: 💻 Usage
+description: Raise an issue here if you don't know how to use vllm.
+title: "[Usage]: "
+labels: ["usage"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: How would you like to use vllm
+    description: |
+      A detailed description of how you want to use vllm.
+    value: |
+      I want to run inference on a [specific model](put link here). I don't know how to integrate it with vllm.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
new file mode 100644
index 0000000000000..f1124dfa78bbc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -0,0 +1,81 @@
+name: 🐛 Bug report
+description: Raise an issue here if you find a bug.
+title: "[Bug]: "
+labels: ["bug"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Your current environment
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 🐛 Describe the bug
+    description: |
+      Please provide a clear and concise description of what the bug is.
+
+      If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
+
+      ```python
+      from vllm import LLM, SamplingParams
+
+      prompts = [
+          "Hello, my name is",
+          "The president of the United States is",
+          "The capital of France is",
+          "The future of AI is",
+      ]
+      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+      llm = LLM(model="facebook/opt-125m")
+
+      outputs = llm.generate(prompts, sampling_params)
+
+      # Print the outputs.
+      for output in outputs:
+          prompt = output.prompt
+          generated_text = output.outputs[0].text
+          print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+      ```
+
+      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
+
+      Please also paste or describe the results you actually observe, rather than only the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may help to wrap error messages in ```` ```triple backtick blocks``` ````.
+    placeholder: |
+      A clear and concise description of what the bug is.
+
+      ```python
+      # Sample code to reproduce the problem
+      ```
+
+      ```
+      The error message you got, with the full traceback.
+      ```
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      ⚠️ Please separate bugs in the `transformers` implementation or usage from bugs in `vllm`. If you think anything is wrong with the model's output:
+
+      - Try the `transformers` counterpart first. If the error appears there as well, please report it to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
+
+      - If the error only appears in vllm, please provide detailed scripts showing how you run `transformers` and `vllm`, highlight the difference, and describe what you expect.
+
+      Thanks for contributing 🎉!
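As a concrete illustration of the triage the template above asks for ("try the `transformers` counterpart first"), a minimal side-by-side check might look like the following sketch. Greedy decoding is assumed so the two outputs are directly comparable, and the model name is just the example from the template:

```python
# Run the same prompt through transformers and vLLM with greedy decoding.
# If the suspect output also shows up on the transformers side, the bug is
# likely not vLLM-specific.
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "facebook/opt-125m"
prompt = "The capital of France is"

# Greedy decoding in transformers.
tokenizer = AutoTokenizer.from_pretrained(model_name)
hf_model = AutoModelForCausalLM.from_pretrained(model_name)
hf_ids = hf_model.generate(**tokenizer(prompt, return_tensors="pt"),
                           max_new_tokens=16, do_sample=False)
print("transformers:", tokenizer.decode(hf_ids[0], skip_special_tokens=True))

# Greedy decoding in vLLM (temperature=0 disables sampling).
llm = LLM(model=model_name)
out = llm.generate([prompt], SamplingParams(temperature=0.0, max_tokens=16))
print("vllm:", out[0].outputs[0].text)
```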
diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature request.yml
new file mode 100644
index 0000000000000..0dd5a3e5d14de
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/500-feature request.yml
@@ -0,0 +1,31 @@
+name: 🚀 Feature request
+description: Submit a proposal/request for a new vllm feature
+title: "[Feature]: "
+labels: ["feature"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: 🚀 The feature, motivation and pitch
+    description: >
+      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link it here too.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Alternatives
+    description: >
+      A description of any alternative solutions or features you've considered, if any.
+- type: textarea
+  attributes:
+    label: Additional context
+    description: >
+      Add any other context or screenshots about the feature request.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new model.yml
new file mode 100644
index 0000000000000..bbddbfd67138a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/600-new model.yml
@@ -0,0 +1,33 @@
+name: 🤗 Support request for a new model from huggingface
+description: Submit a proposal/request for a new model from huggingface
+title: "[New Model]: "
+labels: ["new model"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+- type: textarea
+  attributes:
+    label: The model to consider.
+    description: >
+      A huggingface url pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: The closest model vllm already supports.
+    description: >
+      Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the one you want to add support for?
+- type: textarea
+  attributes:
+    label: What's difficult about supporting the model you want?
+    description: >
+      For example, does it require any new operators or a new architecture?
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
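When filling in "The closest model vllm already supports", a quick way to see which architecture a checkpoint declares (the name vLLM matches against its model list) is to read the checkpoint's config; a small sketch using the gpt2 URL from the template above:

```python
# Print the architecture name declared in the checkpoint's config.json.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("openai-community/gpt2")
print(config.architectures)  # expected: ['GPT2LMHeadModel']
```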
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
new file mode 100644
index 0000000000000..9e8e7b4aa3530
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
@@ -0,0 +1,51 @@
+name: ⚡ Discussion on the performance of vllm
+description: Submit a proposal/discussion about the performance of vllm
+title: "[Performance]: "
+labels: ["performance"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Proposal to improve performance
+    description: >
+      How do you plan to improve vllm's performance?
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Report of performance regression
+    description: >
+      Please provide a detailed description of the performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Misc discussion on performance
+    description: >
+      Anything else about performance.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Your current environment (if you think it is necessary)
+    description: |
+      Please run the following and paste the output below.
+      ```sh
+      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+      # For security purposes, please feel free to check the contents of collect_env.py before running it.
+      python collect_env.py
+      ```
+    value: |
+      ```text
+      The output of `python collect_env.py`
+      ```
+  validations:
+    required: false
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc discussion.yml
new file mode 100644
index 0000000000000..ddb10f72db293
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/800-misc discussion.yml
@@ -0,0 +1,21 @@
+name: 🎲 Misc/random discussions that do not fit into the above categories.
+description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
+title: "[Misc]: "
+labels: ["misc"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: textarea
+  attributes:
+    label: Anything you want to discuss about vllm.
+    description: >
+      Anything you want to discuss about vllm.
+  validations:
+    required: true
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!
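For the performance template above, a rough number makes a regression report much easier to act on. The official scripts live under `benchmarks/`; as a quick stand-in, a minimal offline throughput probe might look like this sketch (model, prompt count, and token budget are arbitrary placeholders):

```python
# Measure rough offline generation throughput in tokens per second.
import time

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
prompts = ["Hello, my name is"] * 64
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - start

generated = sum(len(out.outputs[0].token_ids) for out in outputs)
print(f"{generated / elapsed:.1f} generated tokens/s ({elapsed:.1f}s total)")
```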
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000..3ba13e0cec6cb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false
diff --git a/.yapfignore b/.yapfignore
new file mode 100644
index 0000000000000..2d6dcf8380cac
--- /dev/null
+++ b/.yapfignore
@@ -0,0 +1 @@
+collect_env.py
diff --git a/Dockerfile b/Dockerfile
index dd4867702d3de..8be03b3567f0e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 RUN python3 setup.py build_ext --inplace
 #################### EXTENSION Build IMAGE ####################
 
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# flash attention version
+ARG flash_attn_version=v2.5.6
+ENV FLASH_ATTN_VERSION=${flash_attn_version}
+
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################
 
 #################### TEST IMAGE ####################
 # image to run unit testing suite
@@ -68,6 +84,9 @@ WORKDIR /vllm-workspace
 # ADD is used to preserve directory structure
 ADD . /vllm-workspace/
 COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 # ignore build dependencies installation because we are using pre-compiled extensions
 RUN rm pyproject.toml
 RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
@@ -88,6 +107,11 @@ WORKDIR /workspace
 COPY requirements.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
+
+# Install flash attention (from pre-built wheel)
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+
 #################### RUNTIME BASE IMAGE ####################
 
 
@@ -96,7 +120,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 FROM vllm-base AS vllm-openai
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate
+    pip install accelerate hf_transfer
 
 COPY --from=build /workspace/vllm/*.so /workspace/vllm/
 COPY vllm vllm
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 26d2c24d5655c..8782f5546b21e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,3 +1,5 @@
+# flake8: noqa
+# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation
 # This file has been modified by Neural Magic
 
 import json
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index a0307439cd5f1..5867e3b171919 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,6 @@
+# flake8: noqa
+# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation
+
 import argparse
 import time
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 3f5e2d9c8f4dc..7699304769653 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,3 +1,5 @@
+# flake8: noqa
+# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation
 """Benchmark online serving throughput.
 
 On the server side, run one of the following commands:
diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
index ba27f110d1736..964eca5aaf72b 100644
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -2,7 +2,7 @@
 import os
 import sys
 
-from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name
 import torch
 import torch.nn.functional as F
 import triton
@@ -103,17 +103,25 @@ def run_grid(bs, method):
                 best_config = config
                 best_time_us = kernel_dur_us
 
-            print(
-                f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}'
-            )
+            print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+                  f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+                  f'{d_model=} {model_intermediate_size=} {num_layers=}')
 
     print("best_time_us", best_time_us)
    print("best_config", best_config)
 
-    filename = "/tmp/config.jsonl"
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(num_total_experts,
+                                    model_intermediate_size // tp_size)
     print(f"writing config to file {filename}")
-    with open(filename, "a") as f:
-        f.write(json.dumps({str(bs): best_config}) + "\n")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
 
 
 def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
new file mode 100644
index 0000000000000..f9564dd9588f0
--- /dev/null
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -0,0 +1,120 @@
+from typing import Optional
+
+import argparse
+import torch
+import nvtx
+from itertools import accumulate
+from vllm.model_executor.layers.rotary_embedding import get_rope
+
+
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # simulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor; we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+
+    # create query offsets for batched RoPE; we concatenate multiple kv caches
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+
+    # batch queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before we start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
diff --git a/collect_env.py b/collect_env.py
new file mode 100644
index 0000000000000..3c914795222ee
--- /dev/null
+++ b/collect_env.py
@@ -0,0 +1,692 @@
+# flake8: noqa
+# UPSTREAM SYNC: noqa is required for passing ruff.
+# This file has been modified by Neural Magic
+
+# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
+
+# Unlike the rest of PyTorch, this file must be Python 2 compliant.
+# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +import datetime +import locale +import re +import subprocess +import sys +import os +from collections import namedtuple + + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple('SystemEnv', [ + 'torch_version', + 'is_debug_build', + 'cuda_compiled_version', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'is_cuda_available', + 'cuda_runtime_version', + 'cuda_module_loading', + 'nvidia_driver_version', + 'nvidia_gpu_models', + 'cudnn_version', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'hip_compiled_version', + 'hip_runtime_version', + 'miopen_runtime_version', + 'caching_allocator_config', + 'is_xnnpack_available', + 'cpu_info', + 'rocm_version', # vllm specific field + 'neuron_sdk_version', # vllm specific field + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field +]) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") + and any(name in line for name in patterns) + ) + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', r'cmake (.*)') + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == 'darwin': + cmd = 'kextstat | grep -i cuda' + return 
run_and_parse_first_match(run_lambda, cmd, + r'com[.]nvidia[.]CUDA [(](.*?)[)]') + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r'Driver Version: (.*?) ') + + +def get_gpu_info(run_lambda): + if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(torch.version, 'hip') and torch.version.hip is not None): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r' \(UUID: .+?\)') + rc, out, _ = run_lambda(smi + ' -L') + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, '', out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'nvcc --version', r'release .+ V(.*)') + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") + where_cmd = os.path.join(system_root, 'System32', 'where') + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == 'darwin': + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. 
+ cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get('CUDNN_LIBRARY') + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split('\n'): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = '\n'.join(files) + return 'Probably one of the following:\n{}'.format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = 'nvidia-smi' + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, 'hipcc --version', r'HIP version: (\S+)') + + +def get_neuron_sdk_version(run_lambda): + # Adapted from your install script + try: + result = run_lambda(["neuron-ls"]) + return result if result[0] == 0 else 'N/A' + except Exception: + return 'N/A' + + +def get_vllm_version(): + try: + import vllm + return vllm.__version__ + except ImportError: + return 'N/A' + + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. 
+ return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), + 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', + 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', + ) + + +def get_gpu_topo(run_lambda): + if get_platform() == 'linux': + return run_and_read_all(run_lambda, 'nvidia-smi topo -m') + return None + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda('wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE') + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + 
else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all(run_lambda, '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', r'Description:\t(.*)') + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join( + line + for line in out.splitlines() + if any(name in line for name in patterns) + ) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '') + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get('CUDA_MODULE_LOADING', '') + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, 'hip') or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + else: # HIP version + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else 'N/A' + + cfg = torch._C._show_config().split('\n') + hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime') + miopen_runtime_version = get_version_or_na(cfg, 'MIOpen') + cuda_version_str = 'N/A' + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A' + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + neuron_sdk_version = get_neuron_sdk_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format(sys_version, sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + 
cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + neuron_sdk_version=neuron_sdk_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + ) + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +env_info_fmt += """ +ROCM Version: {rocm_version} +Neuron SDK Version: {neuron_sdk_version} +vLLM Version: {vllm_version} +vLLM Build Flags: +{vllm_build_flags} +GPU Topology: +{gpu_topo} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement='Could not collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. 
+ if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict['nvidia_gpu_models'] = \ + maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + 'cuda_runtime_version', + 'nvidia_gpu_models', + 'nvidia_driver_version', + ] + all_cuda_fields = dynamic_cuda_fields + ['cudnn_version'] + all_dynamic_cuda_fields_missing = all( + mutable_dict[field] is None for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = 'No CUDA' + if envinfo.cuda_compiled_version is None: + mutable_dict['cuda_compiled_version'] = 'None' + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty(mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty(mutable_dict['conda_packages']) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend(mutable_dict['pip_packages'], + '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend(mutable_dict['conda_packages'], + '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include it when you file a report ***" + print(msg, file=sys.stderr) + + + +if __name__ == '__main__': + main() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 22b10f0571d1c..24d972702c858 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -33,12 +33,25 @@ template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. 
// Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 const float f = (float) x; constexpr float ALPHA = M_SQRT1_2; return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + const float f = (float) x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); + return (T) (0.5f * f * (1.0f + ::tanhf(inner))); +} + } // namespace vllm // Launch activation and gating kernel. @@ -73,6 +86,13 @@ void gelu_and_mul( LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } +void gelu_tanh_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); +} + namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/ops.h b/csrc/ops.h index 8519b82b3580d..3f111ac488066 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -55,6 +55,16 @@ void rotary_embedding( torch::Tensor& cos_sin_cache, bool is_neox); +void batched_rotary_embedding( + torch::Tensor& positions, + torch::Tensor& query, + torch::Tensor& key, + int head_size, + torch::Tensor& cos_sin_cache, + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets); + void silu_and_mul( torch::Tensor& out, torch::Tensor& input); @@ -63,6 +73,10 @@ void gelu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_tanh_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); @@ -88,12 +102,12 @@ torch::Tensor awq_dequantize( int thy); torch::Tensor marlin_gemm( - torch::Tensor& a, + torch::Tensor& a, torch::Tensor& b_q_weight, - torch::Tensor& b_scales, + torch::Tensor& b_scales, torch::Tensor& workspace, - int64_t size_m, - int64_t size_n, + int64_t size_m, + int64_t size_n, int64_t size_k); #endif diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 5f522795619e1..d80cb6973fad6 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -8,7 +8,7 @@ namespace vllm { template -inline __device__ void apply_rotary_embedding( +inline __device__ void apply_token_rotary_embedding( scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, const scalar_t* __restrict__ sin_ptr, @@ -38,22 +38,18 @@ inline __device__ void apply_rotary_embedding( } template -__global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +inline __device__ void apply_rotary_embedding( scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, + const scalar_t* cache_ptr, + const int head_size, const int num_heads, const int num_kv_heads, - const int 
head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - + const int rot_dim, + const int token_idx, + const int64_t query_stride, + const int64_t key_stride) +{ const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -63,7 +59,7 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * query_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(query + token_head, cos_ptr, + apply_token_rotary_embedding(query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } @@ -72,11 +68,53 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * key_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(key + token_head, cos_ptr, + apply_token_rotary_embedding(key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } } +template +__global__ void rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + +template +__global__ void batched_rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + } // namespace vllm void rotary_embedding( @@ -128,3 +166,61 @@ void rotary_embedding( } }); } + +/* +Batched version of rotary embedding, pack multiple LoRAs together +and process in batched manner. 
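+Each token carries an entry in cos_sin_cache_offsets; the kernel adds that
+offset to the token's position before indexing the cos/sin cache, so queries
+belonging to different LoRAs (i.e. different scaling factors) read their own
+region of the concatenated cache within a single kernel launch.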
+*/ +void batched_rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] +) { + int64_t num_tokens = cos_sin_cache_offsets.size(0); + int num_heads = query.size(-1) / head_size; + int num_kv_heads = key.size(-1) / head_size; + int64_t query_stride = query.stride(-2); + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), + "rotary_embedding", + [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } else { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } + }); +} diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 4dc90de1ab42a..a7415dfc91369 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -43,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py index 66de56d74f3e7..7ceaf9e6892a5 100644 --- a/csrc/punica/bgmv/generator.py +++ b/csrc/punica/bgmv/generator.py @@ -10,7 +10,7 @@ #include "bgmv_impl.cuh" FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() +""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) for input_dtype in DTYPES: for output_dtype in DTYPES: diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 72e1c5bfea83f..6ab8843ca9e65 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -27,7 +27,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def( "gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU."); + "Activation function used in GeGLU with `none` approximation."); + ops.def( + "gelu_tanh_and_mul", + &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); ops.def( "gelu_new", &gelu_new, @@ -54,6 +58,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def( + "batched_rotary_embedding", + &batched_rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)"); + // Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 
210bf0b023ab2..c25464e866e55 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -29,12 +29,22 @@ __inline__ __device__ T warpReduceSum(T val) { return val; } +__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) { + return warp_size - 1; +} + +__inline__ __device__ constexpr int _calculateWidShift(int warp_size) { + return 5 + (warp_size >> 6); +} + /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { static __shared__ T shared[WARP_SIZE]; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; + constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE); + constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE); + int lane = threadIdx.x & LANE_MASK; + int wid = threadIdx.x >> WID_SHIFT; val = warpReduceSum(val); diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst index b550a9b5faa62..1de6d7adc87c6 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.rst @@ -2,5 +2,5 @@ LLMEngine ================================= .. autoclass:: vllm.engine.llm_engine.LLMEngine - :members: add_request, abort_request, step, _init_cache + :members: add_request, abort_request, step :show-inheritance: \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index c0250bf99f7ae..65bfbbabf8be1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,6 +73,7 @@ Documentation serving/run_on_sky serving/deploying_with_kserve serving/deploying_with_triton + serving/deploying_with_bentoml serving/deploying_with_docker serving/serving_with_langchain serving/metrics diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 71211189d184f..f05fafe9f8279 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,9 +90,10 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). -The following is an example request +The following is an example request + +.. code-block:: bash -.. code-block::bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3b30dfaf3b47b..4019e0bbd90fb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -22,23 +22,23 @@ Alongside each architecture, we include some popular models that use it. * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - - + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - - + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - - + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -46,19 +46,19 @@ Alongside each architecture, we include some popular models that use it. 
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst index b550a9b5faa62..1de6d7adc87c6 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.rst @@ -2,5 +2,5 @@ LLMEngine ================================= .. autoclass:: vllm.engine.llm_engine.LLMEngine - :members: add_request, abort_request, step, _init_cache + :members: add_request, abort_request, step :show-inheritance: \ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst index c0250bf99f7ae..65bfbbabf8be1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,6 +73,7 @@ Documentation serving/run_on_sky serving/deploying_with_kserve serving/deploying_with_triton + serving/deploying_with_bentoml serving/deploying_with_docker serving/serving_with_langchain serving/metrics
diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 71211189d184f..f05fafe9f8279 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -90,9 +90,10 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). -The following is an example request +The following is an example request + +.. code-block:: bash -.. code-block::bash curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3b30dfaf3b47b..4019e0bbd90fb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -22,23 +22,23 @@ Alongside each architecture, we include some popular models that use it. * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - - + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - - + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - - + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -46,19 +46,19 @@ Alongside each architecture, we include some popular models that use it. * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. - - + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. - - + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. - - + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. @@ -66,7 +66,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. @@ -82,27 +82,27 @@ Alongside each architecture, we include some popular models that use it. * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - - + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. - - + - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. - - + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. - - + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - - + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - + - * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. @@ -110,7 +110,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - - + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst new file mode 100644 index 0000000000000..4b9d19f5bdb72 --- /dev/null +++ b/docs/source/serving/deploying_with_bentoml.rst @@ -0,0 +1,8 @@ +.. _deploying_with_bentoml: + +Deploying with BentoML +====================== + +`BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. + +For details, see the tutorial `vLLM inference in the BentoML documentation `_.
\ No newline at end of file diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index cd4451481ca83..7b1d580a9a7f6 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation """ This example shows how to use the multi-LoRA functionality for offline inference. diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py index 9b9dc4d94892f..da8874abd92a2 100644 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -14,14 +14,16 @@ llm = LLM( model="openlm-research/open_llama_3b", max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as max sequence length, - # when targeting neuron device. Currently, this is a known limitation in continuous batching - # support in transformers-neuronx. + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx. max_model_len=128, block_size=128, # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, or explicitly assigned. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. device="neuron") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 1aa718b88907c..2c6c6aa63944d 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + from vllm import LLM, SamplingParams prefix = ( diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index f48b6314eb055..071f134c6e5e0 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,35 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.2.3" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -42,6 +11,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -50,14 +25,14 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 29, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "End to end request latency measured in seconds.", "fieldConfig": { @@ -66,7 +41,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, 
"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -80,7 +54,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -138,11 +111,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -154,11 +127,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -171,11 +144,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -188,11 +161,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -205,10 +178,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -222,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of tokens processed per second", "fieldConfig": { @@ -231,7 +204,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -245,7 +217,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -302,11 +273,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, 
"includeNullMetadata": false, "instant": false, @@ -318,11 +289,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -339,7 +310,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Inter token latency in seconds.", "fieldConfig": { @@ -348,7 +319,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -362,7 +332,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -420,11 +389,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -436,11 +405,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -453,11 +422,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -470,11 +439,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -487,10 +456,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Mean", @@ -504,7 +473,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + 
"uid": "prometheus" }, "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { @@ -513,7 +482,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -527,7 +495,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -585,11 +552,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_running", + "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -601,11 +568,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -618,11 +585,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_waiting", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -639,7 +606,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { @@ -648,7 +615,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -662,7 +628,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -720,11 +685,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -737,11 +702,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -753,11 +718,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -770,11 +735,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, 
"disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -787,10 +752,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -804,7 +769,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { @@ -813,7 +778,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -827,7 +791,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -885,10 +848,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, "legendFormat": "GPU Cache Usage", "range": true, @@ -897,10 +860,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, "instant": false, "legendFormat": "CPU Cache Usage", @@ -913,10 +876,39 @@ } ], "refresh": "", - "schemaVersion": 39, + "schemaVersion": 37, + "style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "current": { + "selected": false, + "text": "vllm", + "value": "vllm" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(model_name)", + "hide": 0, + "includeAll": false, + "label": "model_name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { "from": "now-5m", diff --git a/format.sh b/format.sh index eb2c5ab031626..ff30111123bee 100755 --- a/format.sh +++ b/format.sh @@ -95,13 +95,17 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +CODESPELL_EXCLUDES=( + '--skip' '*docs/source/_build/**' +) + # check spelling of specified files spell_check() { codespell "$@" } spell_check_all(){ - codespell --toml pyproject.toml + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" } # Spelling check of files that differ from main branch. @@ -116,7 +120,7 @@ spell_check_changed() { if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell + codespell "${CODESPELL_EXCLUDES[@]}" fi }
diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 398f8973cc8d2..b0fa4fbe45187 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -27,8 +27,8 @@ def max_model_length_from_model_id(model: str, def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: - #config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs - + # config is a NamedTuple constructed from some JSON + # in neuralmagic/benchmarks/configs kv = vars(config.script_args) keys = kv.keys() @@ -57,8 +57,8 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]: """ - Give a path to a config file in `neuralmagic/benchmarks/configs/*` return an Iterable of - (sub)configs in the file + Given a path to a config file in `neuralmagic/benchmarks/configs/*`, + return an Iterable of (sub)configs in the file """ assert config_file_path.exists()
diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index 110d47e354e24..0c10219501ea1 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -8,7 +8,8 @@ from typing import NamedTuple, Optional from pathlib import Path -from .common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs +from .common import (download_model, max_model_length_from_model_id, + script_args_to_cla, benchmark_configs) from .scripts.common import warmup_server, num_available_gpus from ..tools.call_cmd import call_cmd @@ -67,8 +68,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: server_process = subprocess.Popen("exec " + server_cmd, shell=True) if not is_server_running(BENCH_SERVER_HOST, BENCH_SERVER_PORT): raise ValueError( - f"Aborting bench run with : server-cmd {server_cmd} , bench-cmd {bench_cmd}. Reason: Cannot start Server" - ) + f"Aborting bench run with : server-cmd {server_cmd} , " + f"bench-cmd {bench_cmd}. Reason: Cannot start Server") # server warmup warmup_server(server_host=BENCH_SERVER_HOST, @@ -96,14 +97,14 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with the + # maximum supported for this model.
max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: @@ -120,7 +121,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: server_args["sparsity"] = sparsity server_cmd = "python3 -m vllm.entrypoints.api_server " + \ - " ".join([f"--{k} {v}" for k, v in server_args.items()]) + " ".join([f"--{k} {v}" + for k, v in server_args.items()]) for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py index d6a505df71559..debb98f8a3279 100644 --- a/neuralmagic/benchmarks/run_benchmark_throughput.py +++ b/neuralmagic/benchmarks/run_benchmark_throughput.py @@ -3,7 +3,8 @@ from pathlib import Path from typing import NamedTuple, Optional -from .common import script_args_to_cla, benchmark_configs, max_model_length_from_model_id +from .common import (script_args_to_cla, benchmark_configs, + max_model_length_from_model_id) from ..tools.call_cmd import call_cmd @@ -19,14 +20,14 @@ def run_benchmark_throughput_script(config: NamedTuple, supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with + # the maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py index 078cfd1c6a7fc..b5e0308848e25 100644 --- a/neuralmagic/benchmarks/scripts/backend_request_func.py +++ b/neuralmagic/benchmarks/scripts/backend_request_func.py @@ -135,7 +135,7 @@ async def async_request_vllm( data = part_data output.latency = time.perf_counter() - st - # When streaming, '\0' is appended to the end of the response. + # When streaming, '\0' is appended to the end. body = trim_suffix(data.decode('utf-8'), "\0") output.generated_text = json.loads( body)["text"][0][len(request_func_input.prompt):] @@ -220,7 +220,8 @@ async def async_request_deepspeed_mii( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len - # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder. + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. 
# https://github.com/microsoft/DeepSpeed-MII/pull/311 output.ttft = 0 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index f0c1d8d9951fc..6dc32e9d552ea 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -238,18 +238,14 @@ async def benchmark(backend: str, api_url: str, model_id: str, print(f"Benchmark duration: {metrics.metadata.duration:2f} s") print(f"Total input tokens: {metrics.metadata.total_input}") print(f"Total generated tokens: {metrics.metadata.total_output}") - print( - f"Request throughput: {metrics.metrics.request_throughput:.2f} requests/s" - ) - print( - f"Input token throughput: {metrics.metrics.input_throughput:.2f} tokens/s" - ) - print( - f"Output token throughput: {metrics.metrics.output_throughput:.2f} tokens/s" - ) - print( - f"Median request latency: {metrics.metrics.median_request_latency:.2f} ms" - ) + print(f"Request throughput: " + f"{metrics.metrics.request_throughput:.2f} requests/s") + print(f"Input token throughput: " + f"{metrics.metrics.input_throughput:.2f} tokens/s") + print(f"Output token throughput: " + f"{metrics.metrics.output_throughput:.2f} tokens/s") + print(f"Median request latency: " + f"{metrics.metrics.median_request_latency:.2f} ms") print(f"P90 request latency: {metrics.metrics.p90_request_latency:.2f} ms") print(f"P99 request latency: {metrics.metrics.p99_request_latency:.2f} ms") print(f"Mean TTFT: {metrics.metrics.mean_ttft_ms:.2f} ms") @@ -349,9 +345,9 @@ def script_args_as_json_dict(script_args: argparse.Namespace): result = metrics.update_benchmark_result(result) # Add information about the derived variables as metadata - result[BenchmarkResult.METADATA_KEY_][ - ResultMetadataKeys.num_prompts] = num_prompts - result[BenchmarkResult.METADATA_KEY_][ResultMetadataKeys.request_rate] = \ + metadata_key = BenchmarkResult.METADATA_KEY_ + result[metadata_key][ResultMetadataKeys.num_prompts] = num_prompts + result[metadata_key][ResultMetadataKeys.request_rate] = \ request_rate if request_rate < float("inf") else "inf" # Save to file @@ -387,9 +383,8 @@ def from_str(arg: str): "--description", type=str, default="benchmark-serving", - help= - "Benchmark description. This is primarily useful when we log the benchmark results and process them for plotting charts" - ) + help="Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument( "--backend", type=str, @@ -437,8 +432,8 @@ def from_str(arg: str): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default model tokenizer.", + help="Name or path of the tokenizer, " + "if not using the default model tokenizer.", ) parser.add_argument( "--best-of", @@ -485,11 +480,10 @@ def from_str(arg: str): parser.add_argument("--nr-qps-pair_", type=NumPrompts_RequestRate_T.from_str, help=""" - First argument in the pair is num_prompts: Number of prompts to process. - Second argument in the pair is request_rate : Number of requests per second. If this is inf, - then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize - the request arrival times. - """, + First argument in the pair is num_prompts to process. + Second argument in the pair is request_rate per second. + If this is inf, then all the requests are sent at time 0. 
+ Otherwise, we use Poisson process to synthesize + the request arrival times.""", default=None) # Server command args @@ -498,29 +492,32 @@ def from_str(arg: str): type=int, default=None, help= - "tensor-parallel-size that the benchmarking script was invoked with. It is useful to log this information when storing benchmarking results" + "tensor-parallel-size that the benchmarking script was invoked with. " + "It is useful to log this information when storing benchmarking results" ) parser.add_argument( "--server-args", type=str, default=None, - help= - "When we are logging the output, it is useful to log the arguments passed to the server" - ) + help="When we are logging the output, it is useful to log the " + "arguments passed to the server") def args_sanity_check(args): # Sanity check real-dataset vs synthetic-dataset usecase if args.dataset is None: - assert args.num_input_tokens is not None and args.num_output_tokens is not None + assert (args.num_input_tokens is not None + and args.num_output_tokens is not None) else: - assert args.num_input_tokens is None and args.num_output_tokens is None - # Sanity check num_prompts, request_rate as separate args vs joint args usecase + assert (args.num_input_tokens is None + and args.num_output_tokens is None) + # Sanity check num_prompts, request_rate as separate args vs joint args assert not all([ args.num_prompts_ is None, args.request_rate_ is None, args.nr_qps_pair_ is None ]) if args.nr_qps_pair_ is None: - assert args.num_prompts_ is not None and args.request_rate_ is not None + assert (args.num_prompts_ is not None + and args.request_rate_ is not None) else: assert args.num_prompts_ is None and args.request_rate_ is None # Sanity check required logging args
diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index 9138ea0f8ad47..ba586772d5d09 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -12,7 +12,8 @@ from pathlib import Path from typing import List, Optional, Tuple from transformers import AutoTokenizer -from .common import generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs +from .common import (generate_synthetic_requests, warmup_vllm_engine, + num_available_gpus, print_request_outputs) from .datasets_registry import get_dataset, DatasetArgs from .logging.benchmark_result import (BenchmarkResult, BenchmarkThroughputResultMetricTemplates @@ -163,7 +164,7 @@ def main(args: argparse.Namespace): current_dt_str = current_dt.strftime("%Y%m%d-%H%M%S") file_name = Path( args.save_directory ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 result.store(file_name) @@ -173,9 +174,8 @@ def main(args: argparse.Namespace): "--description", type=str, default="benchmark-throughput", - help= - "Benchmark description.
This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument("--backend", type=str, choices=["vllm"], diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index d4addb99a2878..9333939300e92 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -12,7 +12,8 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR -from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm +from .backend_request_func import (RequestFuncInput, RequestFuncOutput, + async_request_vllm) from ...tools.call_cmd import call_cmd @@ -23,7 +24,7 @@ def num_available_gpus() -> int: def get_benchmarking_context() -> dict: """ - Return the current python version, pytorch version and CUDA version as a dict + Return the current python, pytorch and CUDA version as a dict """ import sys import torch @@ -100,7 +101,7 @@ def warmup_requests(tokenizer: PreTrainedTokenizerBase, num_input_tokens: int = 128, num_output_tokens: int = 1) -> List[Tuple[str, int, int]]: """ - Given a tokenizer, generate `num_requests` requests that would be used for vllm engine warmup + Given a tokenizer, generate `num_requests` requests used for warmup """ words = list(tokenizer.get_vocab().keys()) requests = [] @@ -187,7 +188,7 @@ async def process_requests(input_requests): def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int, n_output_tokens: int) -> str: - return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" + return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 def print_request_outputs(results: List[RequestOutput]) -> None: @@ -202,8 +203,8 @@ def print_request_outputs(results: List[RequestOutput]) -> None: def print_serving_request_io(inputs: List[Tuple[str, int, int]], outputs: List[RequestFuncOutput]) -> None: """ - inputs: list of tuples where the tuple is [prompt, prompt_length, output_length], - outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py) + inputs: list of tuples of form [prompt, prompt_length, output_length], + outputs: list of RequestFuncOutput output from benchmark_serving.py Format and print the inputs and outputs. 
""" for i, o in zip(inputs, outputs): diff --git a/neuralmagic/benchmarks/scripts/datasets_registry.py b/neuralmagic/benchmarks/scripts/datasets_registry.py index b710c712d24cb..919abb72ee39b 100644 --- a/neuralmagic/benchmarks/scripts/datasets_registry.py +++ b/neuralmagic/benchmarks/scripts/datasets_registry.py @@ -63,8 +63,8 @@ def get_ultrachat(tokenizer: PreTrainedTokenizerBase, prompts = [] completions = [] system_message = { - "content": - "You are a chatbot with the explicit goal of helping the user as best as possible", + "content": "You are a chatbot with the explicit goal of " + "helping the user as best as possible", "role": "system", } for messages in ds["messages"]: diff --git a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py index a997cbb855698..37b9c49aa9fd4 100644 --- a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py +++ b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py @@ -1,5 +1,5 @@ """ -Defines a BenchmarkResult class that all the benchmarks use store the benchmark results. +Defines a BenchmarkResult class that all the benchmarks use to save results. """ import json @@ -16,9 +16,9 @@ # NOTE - PLEASE READ: # Any modifications that adds/removes the keys in the JSON that BenchmarkResult # produces should also update the BENCHMARK_RESULTS_SCHEMA_VERSION. -# The primary use case is to establish a set of keys that can be queried against reliably. -# TODO (varun) : Initial version is named 0.0.0 as things are under development. Update it -# when things are stable. +# The primary use case is to establish a set of keys that can be queried. +# TODO (varun) : Initial version is named 0.0.0 as things are under development. +# Update it when things are stable. BENCHMARK_RESULTS_SCHEMA_VERSION = "0.0.0" @@ -158,7 +158,7 @@ def __init__(self, description: str, date: datetime, script_name: str, dataset if dataset is not None else "synthetic", self.SCRIPT_ARGS_KEY_: script_args, - # Any metadata that the caller script wants to store should be stored here. + # Any metadata that the caller script wants to store. self.METADATA_KEY_: {}, # Any benchmarking metrics should be stored here. self.METRICS_KEY_: {} diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py index a7564417ba702..a89820da7dae9 100644 --- a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py +++ b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py @@ -10,7 +10,8 @@ from dataclasses import dataclass from typing import List, Iterable, NamedTuple -from .benchmark_result import GHABenchmarkToolName, BenchmarkResult, MetricTemplate +from .benchmark_result import (GHABenchmarkToolName, BenchmarkResult, + MetricTemplate) @dataclass @@ -123,29 +124,29 @@ def dump_to_json(gha_records: List[GHARecord], output_path: Path): Reference : https://github.com/benchmark-action/github-action-benchmark """) - parser.add_argument("-i", - "--input-json-directory", - required=True, - type=str, - help=""" - Path to the directory containing BenchmarkResult jsons. - This is typically the output directory passed to the benchmark - runner scripts like neuralmagic/benchmarks/run_benchmarks.py. - """) - - parser.add_argument("--bigger-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName BiggerIsBetter metrics are to be stored. 
- """) - - parser.add_argument("--smaller-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName SmallerIsBetter metrics are to be stored - """) + parser.add_argument( + "-i", + "--input-json-directory", + required=True, + type=str, + help="""Path to the directory containing BenchmarkResult + jsons. This is typically the output directory passed + to the benchmark runner scripts like + neuralmagic/benchmarks/run_benchmarks.py.""") + + parser.add_argument( + "--bigger-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + BiggerIsBetter metrics are to be stored.""") + + parser.add_argument( + "--smaller-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + SmallerIsBetter metrics are to be stored""") args = parser.parse_args() diff --git a/neuralmagic/tools/call_cmd.py b/neuralmagic/tools/call_cmd.py index 2ff84a0c02a5f..1168ab5043bfd 100644 --- a/neuralmagic/tools/call_cmd.py +++ b/neuralmagic/tools/call_cmd.py @@ -1,6 +1,9 @@ # -# Run cmd as a sub-process. Capture stdout, stderr, return status, elapsed time and -# optionally process statistics (user time, system time, peak memory usage, etc.) +# Run cmd as a sub-process. +# +# Capture stdout, stderr, return status, elapsed time and +# optionally process statistics +# (user time, system time, peak memory usage, etc.) # import os import re @@ -12,8 +15,8 @@ def parse_process_stats(str): exp = ( - "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " - "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" + "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 + "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" # noqa: E501 ) results = re.search(exp, str) if results: diff --git a/pyproject.toml b/pyproject.toml index f74e50265be24..d6fa5d7a035ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,6 @@ ignore = [ "F405", "F403", # lambda expression assignment "E731", - # line too long, handled by black formatting - "E501", # .strip() with multi-character strings "B005", # Loop control variable not used within loop body diff --git a/requirements.txt b/requirements.txt index 05ec2e804e13b..d6c33ad85da58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 -outlines >= 0.0.27 +outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. diff --git a/setup.py b/setup.py index af78769695811..6c1b4a91134d0 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff. 
# This file has been modified by Neural Magic import contextlib @@ -433,6 +435,12 @@ def get_requirements() -> List[str]: else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") + if nvcc_cuda_version <= Version("11.8"): + # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x + for i in range(len(requirements)): + if requirements[i].startswith("cupy-cuda12x"): + requirements[i] = "cupy-cuda11x" + break return requirements diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index b280fd1d73c2f..44ac05a1430b3 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -274,3 +274,90 @@ def test_reset(): # Resetting block manager frees all allocated blocks. block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + + +def test_sliding_window_multi_seq(): + """ + Tests that memory allocation and deallocation is handled + correctly with multiple sequences that exceed the sliding + window's capacity. + """ + block_size = 1 + num_cpu_blocks = 8 + num_gpu_blocks = 8 + sliding_window = 2 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + sliding_window=sliding_window, + watermark=0) + + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks + + parent = Sequence(1, "one two three", [0, 1, 2], block_size) + seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), + None) + block_manager.allocate(seq_group) + + # assert the number of blocks allocated is correct + # the parent seq has len 3, but since sliding_window is 2, + # we will use at most 2 blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # Fork prompt and copy block tables. + child = parent.fork(2) + block_manager.fork(parent, child) + + # assert the number of blocks allocated is correct + # forking does not increase memory consumption + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # assert both parent and child share all blocks + assert block_manager.get_block_table( + parent) == block_manager.get_block_table(child) + + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + + # assert the number of blocks allocated is correct + # we will use now one block more. Each seq will use 2 blocks, + # but only one can be shared + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + token_id = 5 + parent.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(parent) + + # assert the number of blocks allocated is correct + # no change, because both sequences are still just sharing one block + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + block_table_parent = block_manager.get_block_table(parent) + block_table_child = block_manager.get_block_table(child) + + assert block_table_parent != block_table_child + + # assert both blocks are sharing the second-last block + assert block_table_parent[-2] == block_table_child[-2] + + # now let's clean up... + block_manager.free(parent) + + # assert the number of blocks allocated is correct + # We have freed one seq, reducing the ref count of two blocks by one. + # One of the two was only used by the parent seq, so this is now free. 
+ # The child seq still consumes sliding_window blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # free all blocks + block_manager.free(child) + + # assert all blocks are free now + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 6b6903635e110..4a2b89befd93f 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -551,12 +551,6 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI): assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None -# *** UPSTREAM SYNC *** -# This test covers an experimental feature in vLLM, guided generation. -# Currently, there is an upstream issue being debugged -# See: https://github.com/vllm-project/vllm/pull/3383 -# Once this is resolved upstream, turn the test back on. -@pytest.mark.skip("Issue upstream, currently being resolved.") async def test_guided_regex_chat(server, client: openai.AsyncOpenAI): messages = [{ "role": "system", diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index e0dec144eba11..f78913f120aa4 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -16,7 +16,7 @@ ] -@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) +@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -24,7 +24,7 @@ @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( - activation: Type[torch.nn.Module], + activation: str, num_tokens: int, d: int, dtype: torch.dtype, @@ -36,7 +36,12 @@ def test_act_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = activation() + if activation == "silu": + layer = SiluAndMul() + elif activation == "gelu": + layer = GeluAndMul(approximate="none") + elif activation == "gelu_tanh": + layer = GeluAndMul(approximate="tanh") out = layer(x) ref_out = layer._forward(x) # The SiLU and GELU implementations are equivalent to the native PyTorch diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0d27bbaff9fc5..ffdcc1e8c80fd 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import List, Optional import pytest import torch from allclose_default import get_default_atol, get_default_rtol +from itertools import accumulate from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] @@ -72,3 +73,135 @@ def test_rotary_embedding( ref_key, atol=get_default_atol(out_key), rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + 
rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": (1, ) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key) + out_query, out_key = rope.forward(positions, + query, + key, + offsets=torch.zeros(batch_size * seq_len, + dtype=int, + device=device)) + # Compare the results. + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + scaling_factors: List[int] = [1, 2, 4] + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + query_offsets = offset_map[query_types] + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key, query_offsets) + out_query, out_key = rope.forward(positions, query, key, + query_offsets.flatten()) + # Compare the results. 
+ assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 67273144ecd02..30a8ad03c8ada 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,4 +152,5 @@ def get_model_patched(model_config, device_config, **kwargs): @pytest.fixture def llama_2_7b_model_extra_embeddings( llama_2_7b_engine_extra_embeddings) -> nn.Module: - yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model + yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. + model_runner.model) diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index cbe0f6fa2e851..fd707766c6a30 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -45,7 +45,7 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, - 24576, 32000, 32256, 32512, 32768, 33024 + 22016, 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index aefaf881048b8..44319b6ca45ff 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -1,5 +1,5 @@ -"""Compare the logprobs of two sequences generated by different models, which should -be similar but not necessarily equal. +"""Compare the logprobs of two sequences generated by different models, +which should be similar but not necessarily equal. """ @@ -15,15 +15,17 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match ... + # If generated tokens don't match, then if output_id_0 != output_id_1: - # ... each predicted token must be in top N logprobs of the other's + # Each predicted token must be in top N logprobs of the other assert output_id_0 in logprobs_1[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" - ) + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") assert output_id_1 in logprobs_0[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" - ) + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") # Break out since sequences will now diverge. break diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index fed9dfb35e881..aa885661a0af3 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -1,4 +1,4 @@ -"""Compare the outputs of a sparse model running sparse vs sparse model running dense. +"""Compare the outputs of a sparse model vs sparse model running dense. Note: sparse kernels do not have bitwise correctness vs the dense models. As a result, in this test, we just confirm that the top selected tokens of the sparse models are in the top N selections of same model running dense. @@ -41,8 +41,6 @@ def test_models( sparse_outputs = sparse_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. 
- del sparse_model.model.llm_engine.driver_worker del sparse_model gc.collect() @@ -53,8 +51,6 @@ def test_models( dense_outputs = dense_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. - del dense_model.model.llm_engine.driver_worker del dense_model gc.collect() diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index 1abb9269dc15e..fddebd58104e3 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -36,10 +36,9 @@ def test_models( sparsity=None, dtype=dtype, max_model_len=1024) - dense_num_kv_blocks = dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + dense_num_kv_blocks = (dense_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) - # Note: deleting just the model does not always free the GPU memory, not sure why. - del dense_model.model.llm_engine.driver_worker del dense_model torch.cuda.empty_cache() gc.collect() @@ -48,10 +47,9 @@ def test_models( sparsity=sparsity, dtype=dtype, max_model_len=1024) - sparse_num_kv_blocks = sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + sparse_num_kv_blocks = (sparse_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) - # Note: deleting just the model does not always free the GPU memory, not sure why. - del sparse_model.model.llm_engine.driver_worker del sparse_model torch.cuda.empty_cache() gc.collect() diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 1dca24ffa9a53..7c0382dfa7b34 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -20,7 +20,8 @@ import gc from compare_utils import check_logprobs_close from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) MAX_MODEL_LEN = 1024 @@ -67,11 +68,7 @@ def test_models( marlin_outputs = marlin_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del marlin_model.model.llm_engine.driver_worker del marlin_model - gc.collect() torch.cuda.empty_cache() @@ -82,9 +79,6 @@ def test_models( max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del gptq_model.model.llm_engine.driver_worker del gptq_model gc.collect() torch.cuda.empty_cache() diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 80cbf2a48efc4..8878510bd0a93 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -51,7 +51,6 @@ def test_models( max_tokens, num_logprobs) - del vllm_model.model.llm_engine.driver_worker del vllm_model # loop through the prompts diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000000000..13a9f76212679 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,43 @@ +from vllm.config import ModelConfig + + +def test_get_sliding_window(): + TEST_SLIDING_WINDOW = 4096 + # Test that the sliding window is correctly computed. + # For Qwen1.5/Qwen2, get_sliding_window() should be None + # when use_sliding_window is False. 
+ qwen2_model_config = ModelConfig( + "Qwen/Qwen1.5-7B", + "Qwen/Qwen1.5-7B", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + + qwen2_model_config.hf_config.use_sliding_window = False + qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert qwen2_model_config.get_sliding_window() is None + + qwen2_model_config.hf_config.use_sliding_window = True + assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW + + mistral_model_config = ModelConfig( + "mistralai/Mistral-7B-v0.1", + "mistralai/Mistral-7B-v0.1", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + mistral_model_config.hf_config.sliding_window = None + assert mistral_model_config.get_sliding_window() is None + + mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW \ No newline at end of file diff --git a/vllm/__init__.py b/vllm/__init__.py index 6aa055bea3c88..501b2dee6e789 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -3,7 +3,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.entrypoints.llm import LLM from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams @@ -19,5 +19,5 @@ "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", - "initialize_cluster", + "initialize_ray_cluster", ] diff --git a/vllm/config.py b/vllm/config.py index c8e261afc8377..56fe6b522e7ee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,6 +1,6 @@ # This file has been modified by Neural Magic -from typing import Optional, Union, ClassVar +from typing import TYPE_CHECKING, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version @@ -12,6 +12,9 @@ from vllm.transformers_utils.config import get_config from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + logger = init_logger(__name__) _GB = 1 << 30 @@ -104,6 +107,7 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -141,7 +145,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -168,7 +172,8 @@ def _verify_sparsity(self) -> None: raise ValueError("Both sparsity and quantization detected. Only " "one or the other is supported at a time.") - if self.sparsity is not None and self.sparsity not in supported_sparsity: + if (self.sparsity is not None + and self.sparsity not in supported_sparsity): raise ValueError(f"Unknown sparse method: {self.sparsity}. 
Must " f"be one of {supported_sparsity}.") @@ -254,6 +259,15 @@ def verify_with_parallel_config( f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size, or None if disabled. + """ + + # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in + # addition to sliding window size. We check if that field is present + # and if it's False, return None. + if (hasattr(self.hf_config, "use_sliding_window") + and not self.hf_config.use_sliding_window): + return None return getattr(self.hf_config, "sliding_window", None) def get_vocab_size(self) -> int: @@ -427,6 +441,7 @@ def __init__( max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, + placement_group: Optional["PlacementGroup"] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): @@ -442,6 +457,7 @@ def __init__( self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce self.ray_workers_use_nsight = ray_workers_use_nsight + self.placement_group = placement_group self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. @@ -644,7 +660,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 8bfc14999f0a7..8b089a5650f48 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -312,7 +312,12 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: # Thus, it is always safe from OOM. src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.copy() - for block in src_block_table: + # When using a sliding window, blocks will be eventually reused. + # In this case the block tables will contain repeated blocks. + # When forking, we must make sure that each block's `ref_count` + # is only incremented by one, so we deduplicate them by wrapping + # them in a set. + for block in set(src_block_table): block.ref_count += 1 def _get_physical_blocks( @@ -393,7 +398,15 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: return block_number_mapping def _free_block_table(self, block_table: BlockTable) -> None: - for block in set(block_table): + # when using a sliding window, each seq will only use up + # to `self.block_sliding_window` blocks. When freeing + # the block table, we must make sure to not free blocks more + # than once. If no sliding window is used, there is no block + # reuse in the block table, so we must free all blocks. 
+ blocks_to_free = (block_table[-self.block_sliding_window:] + if self.block_sliding_window is not None else + block_table) + for block in set(blocks_to_free): if block.device == Device.GPU: self.gpu_allocator.free(block) else: diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5629d1a863d04..0cee604c14d45 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,8 +2,8 @@ import os import time from functools import partial -from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator, Callable) +from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, + Union, AsyncIterator) from transformers import PreTrainedTokenizer @@ -11,7 +11,7 @@ from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -208,17 +208,10 @@ async def step_async(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): # Execute the model. - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] + output = await self.model_executor.execute_model_async( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -268,37 +261,8 @@ async def add_request_async( lora_request=lora_request, ) - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = getattr(self.driver_worker, method) - coros.append(asyncio.get_event_loop().run_in_executor( - None, partial(driver_executor, *driver_args, **driver_kwargs))) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs - - async def check_health_async(self): - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() + async def check_health_async(self) -> None: + self.model_executor.check_health() class AsyncLLMEngine: @@ -353,6 +317,34 @@ def __init__(self, self._request_tracker: Optional[RequestTracker] = None self._errored_with: Optional[BaseException] = None + @classmethod + def from_engine_args(cls, + engine_args: AsyncEngineArgs, + start_engine_loop: bool = True) -> "AsyncLLMEngine": + """Creates an async LLM engine from the engine arguments.""" + # Create the engine configs. 
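The async path above now defers to `execute_model_async` on the executor. Under the hood, both the removed `_run_workers_async` and the `make_async` helper used by the new executors push the blocking driver call onto a thread so the event loop stays free. A minimal sketch of that pattern, standard library only (`make_async_sketch` and `blocking_model_step` are illustrative names):

```python
import asyncio
from functools import partial
from typing import Any, Callable

def make_async_sketch(func: Callable[..., Any]) -> Callable[..., Any]:
    """Wrap a blocking callable so awaiting it runs it in the default
    thread-pool executor, as the removed driver path did."""
    async def _wrapper(*args: Any, **kwargs: Any) -> Any:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None,
                                          partial(func, *args, **kwargs))
    return _wrapper

def blocking_model_step(n: int) -> int:
    return n * 2  # stand-in for driver_worker.execute_model

async def main() -> None:
    assert await make_async_sketch(blocking_model_step)(21) == 42

asyncio.run(main())
```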
+ engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync + executor_class = RayGPUExecutorAsync + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutorAsync + executor_class = GPUExecutorAsync + # Create the async LLM engine. + engine = cls(parallel_config.worker_use_ray, + engine_args.engine_use_ray, + *engine_configs, + executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + max_log_len=engine_args.max_log_len, + start_engine_loop=start_engine_loop) + return engine + @property def is_running(self) -> bool: return (self.background_loop is not None @@ -670,35 +662,13 @@ async def get_model_config(self) -> ModelConfig: else: return self.engine.get_model_config() - @classmethod - def from_engine_args(cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config, - engine_args.engine_use_ray) - # Create the async LLM engine. - engine = cls(parallel_config.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - placement_group, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=start_engine_loop) - return engine - async def do_log_stats(self) -> None: if self.engine_use_ray: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() - async def check_health(self): + async def check_health(self) -> None: """Raises an error if engine is unhealthy.""" t = time.perf_counter() logger.debug("Starting health check...") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3e35e04107fa2..cb1c7d48d154a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,13 +1,5 @@ -# This file has been modified by Neural Magic - -import copy -from collections import defaultdict -import os import time -import pickle -import importlib -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Union) +from typing import Dict, Iterable, List, Optional, Tuple, Type, Union from transformers import PreTrainedTokenizer @@ -17,8 +9,9 @@ ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs +from vllm.executor.executor_base import ExecutorBase from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -26,29 +19,11 @@ SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) -from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, - get_open_port, get_distributed_init_method) - -if ray: - from ray.util.scheduling_strategies import 
PlacementGroupSchedulingStrategy
-
-if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
+from vllm.utils import Counter
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
-# A map between the device type (in device config) to its worker module.
-DEVICE_TO_WORKER_MODULE_MAP = {
- "cuda": "vllm.worker.worker",
- "neuron": "vllm.worker.neuron_worker",
-}
-
-# If the env var is set, it uses the Ray's compiled DAG API
-# which optimizes the control plane overhead.
-# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
-USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))
-
 class LLMEngine:
 """An LLM engine that receives requests and generates texts.
@@ -73,8 +48,8 @@ class LLMEngine:
 parallel_config: The configuration related to distributed execution.
 scheduler_config: The configuration related to the request scheduler.
 device_config: The configuration related to the device.
- placement_group: Ray placement group for distributed execution.
- Required for distributed execution.
+ executor_class: The model executor class for managing distributed
+ execution.
 log_stats: Whether to log statistics.
 """
@@ -86,7 +61,7 @@ def __init__(
 scheduler_config: SchedulerConfig,
 device_config: DeviceConfig,
 lora_config: Optional[LoRAConfig],
- placement_group: Optional["PlacementGroup"],
+ executor_class: Type[ExecutorBase],
 log_stats: bool,
 ) -> None:
 logger.info(
@@ -124,33 +99,13 @@ def __init__(
 self._init_tokenizer()
 self.seq_counter = Counter()
- # Create the parallel GPU workers.
- if self.parallel_config.worker_use_ray:
- # Disable Ray usage stats collection.
- ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
- if ray_usage != "1":
- os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
- # Pass additional arguments to initialize the worker
- additional_ray_args = {}
- if self.parallel_config.ray_workers_use_nsight:
- logger.info("Configuring Ray workers to use nsight.")
- additional_ray_args = {
- "runtime_env": {
- "nsight": {
- "t": "cuda,cudnn,cublas",
- "o": "'worker_process_%p'",
- "cuda-graph-trace": "node",
- }
- }
- }
- self._init_workers_ray(placement_group, **additional_ray_args)
- else:
- self._init_workers()
-
- # Profile the memory usage and initialize the cache.
- self._init_cache()
+ self.model_executor = executor_class(model_config, cache_config,
+ parallel_config, scheduler_config,
+ device_config, lora_config)
 # Create the scheduler.
+ # NOTE: the cache_config here has been updated with the numbers of
+ # GPU and CPU blocks, which are profiled in the distributed executor.
 self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
 # Metric Logging.
@@ -160,9 +115,29 @@ def __init__(
 labels=dict(model_name=model_config.model))
 self.stat_logger.info("cache_config", self.cache_config)
- self.forward_dag = None
- if USE_RAY_COMPILED_DAG:
- self.forward_dag = self._compiled_ray_dag()
+ @classmethod
+ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
+ """Creates an LLM engine from the engine arguments."""
+ # Create the engine configs.
+ engine_configs = engine_args.create_engine_configs()
+ parallel_config = engine_configs[2]
+
+ # Initialize the cluster and specify the executor class. 
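From the caller's side this factory is unchanged by the refactor; the executor choice below stays internal. A usage sketch (the model name is illustrative, not taken from this diff):

```python
from vllm import EngineArgs, LLMEngine

# from_engine_args now also picks and wires up the executor class.
engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
```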
+ if parallel_config.worker_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutor + executor_class = RayGPUExecutor + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutor + executor_class = GPUExecutor + + # Create the LLM engine. + engine = cls(*engine_configs, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats) + return engine def __reduce__(self): # This is to ensure that the LLMEngine is not referenced in @@ -176,39 +151,6 @@ def get_tokenizer_for_seq(self, sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker - - def _init_workers(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - - self.workers: List[Worker] = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self._run_workers("init_model") - self._run_workers("load_model") - def _init_tokenizer(self, **tokenizer_init_kwargs): init_kwargs = dict( enable_lora=bool(self.lora_config), @@ -221,126 +163,6 @@ def _init_tokenizer(self, **tokenizer_init_kwargs): self.tokenizer: TokenizerGroup = TokenizerGroup( self.model_config.tokenizer, **init_kwargs) - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - num_gpus = self.cache_config.gpu_memory_utilization - else: - num_gpus = 1 - - self.driver_dummy_worker: RayWorkerVllm = None - self.workers: List[RayWorkerVllm] = [] - - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - else: - self.workers.append(worker) - - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. 
Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) - - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # Set CUDA_VISIBLE_DEVICES for the driver. - set_cuda_visible_devices(node_gpus[driver_node_id]) - for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): - worker.set_cuda_visible_devices.remote(node_gpus[node_id]) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - # Initialize torch distributed process group for the workers. - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - for rank, (worker, (node_id, - _)) in enumerate(zip(self.workers, - worker_node_and_gpu_ids), - start=1): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) - - # don't use cupy for eager mode - self._run_workers("init_model", - cupy_port=get_open_port() - if not model_config.enforce_eager else None) - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) self.cache_config.verify_with_parallel_config(self.parallel_config) @@ -349,81 +171,6 @@ def _verify_args(self) -> None: self.lora_config.verify_with_scheduler_config( self.scheduler_config) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. 
- - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameters. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - # FIXME(woosuk): Change to debug log. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = self.cache_config.block_size * num_gpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - @classmethod - def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config) - # Create the LLM engine. - engine = cls(*engine_configs, - placement_group, - log_stats=not engine_args.disable_log_stats) - return engine - def encode_request( self, request_id: str, # pylint: disable=unused-argument @@ -829,7 +576,7 @@ def step(self) -> List[RequestOutput]: - A Sequence Group (SG) refer to a group of sequences that are generated from the same prompt. - - Step 2: Calls the workers to execute the model. + - Step 2: Calls the distributed executor to execute the model. - Step 3: Processes the model output. This mainly includes: - Decodes the relevant outputs. @@ -865,19 +612,10 @@ def step(self) -> List[RequestOutput]: seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) - - # Only the driver worker returns the sampling results. 
- output = all_outputs[0] + output = self.model_executor.execute_model( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -1046,111 +784,13 @@ def _finalize_sequence(self, seq: Sequence, seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) + return self.model_executor.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) + return self.model_executor.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - output_channels = self.forward_dag.execute(1) - else: - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - - # Get the results of the ray workers. - if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. - for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs - - def _compiled_ray_dag(self): - import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version - if current_version < required_version: - raise ValueError(f"Ray version {required_version} or greater is " - f"required, but found {current_version}") - - from ray.dag import MultiOutputNode, InputNode - assert self.parallel_config.worker_use_ray - - # Right now, compiled DAG requires at least 1 arg. We send - # a dummy value for now. It will be fixed soon. 
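For orientation, the `step()` refactor above changes only who executes the model; the driving loop callers write stays the same. A hedged sketch of that loop, assuming the engine's existing `add_request`/`has_unfinished_requests` API (prompt, request id, and parameters illustrative):

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("request-0", "The capital of France is",
                   SamplingParams(temperature=0.0, max_tokens=16))

# Each step schedules sequences, runs the model through the executor,
# and returns updated RequestOutputs.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)
```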
- with InputNode() as input_data:
- forward_dag = MultiOutputNode([
- worker.execute_model_compiled_dag_remote.bind(input_data)
- for worker in self.workers
- ])
- return forward_dag.experimental_compile()
+ return self.model_executor.list_loras()
 def check_health(self) -> None:
- """Raises an error if engine is unhealthy."""
- self._check_if_any_actor_is_dead()
-
- def _check_if_any_actor_is_dead(self):
- if not self.parallel_config.worker_use_ray:
- return
-
- if not self.workers:
- return
-
- dead_actors = []
- for actor in self.workers:
- actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access
- if actor_state["State"] == "DEAD":
- dead_actors.append(actor)
- if dead_actors:
- raise RuntimeError("At least one Worker is dead. "
- f"Dead Workers: {dead_actors}. ")
+ self.model_executor.check_health()
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index bbcbbdfea2f00..742f3dc575190 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -1,6 +1,6 @@
 import pickle
-from typing import Optional, List, Tuple, TYPE_CHECKING
+from typing import Optional, List, Tuple
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
@@ -65,45 +65,38 @@ def execute_model_compiled_dag_remote(self, ignored):
 ray = None
 RayWorkerVllm = None
-if TYPE_CHECKING:
- from ray.util.placement_group import PlacementGroup
-
-def initialize_cluster(
+def initialize_ray_cluster(
 parallel_config: ParallelConfig,
- engine_use_ray: bool = False,
 ray_address: Optional[str] = None,
-) -> Optional["PlacementGroup"]:
- """Initialize the distributed cluster probably with Ray.
+):
+ """Initialize the distributed cluster with Ray.
+
+ It will connect to the Ray cluster and create a placement group
+ for the workers, which includes the specification of the resources
+ for each distributed worker.
 Args:
 parallel_config: The configurations for parallel execution.
- engine_use_ray: Whether to use Ray for async engine.
 ray_address: The address of the Ray cluster. If None, uses
 the default Ray cluster address.
-
- Returns:
- An optional `PlacementGroup`. It includes the specification
- of the resources for each distributed worker. None if Ray is
- not used.
 """
- if parallel_config.worker_use_ray or engine_use_ray:
- if ray is None:
- raise ImportError(
- "Ray is not installed. Please install Ray to use distributed "
- "serving.")
- # Connect to a ray cluster.
- if is_hip():
- ray.init(address=ray_address,
- ignore_reinit_error=True,
- num_gpus=parallel_config.world_size)
- else:
- ray.init(address=ray_address, ignore_reinit_error=True)
-
- if not parallel_config.worker_use_ray:
- assert parallel_config.world_size == 1, (
- "Ray is required if parallel_config.world_size > 1.")
- return None
+ if ray is None:
+ raise ImportError(
+ "Ray is not installed. Please install Ray to use distributed "
+ "serving.")
+
+ # Connect to a ray cluster.
+ if is_hip():
+ ray.init(address=ray_address,
+ ignore_reinit_error=True,
+ num_gpus=parallel_config.world_size)
+ else:
+ ray.init(address=ray_address, ignore_reinit_error=True)
+
+ if parallel_config.placement_group:
+ # Placement group is already set.
+ return
 # Create placement group for worker processes
 current_placement_group = ray.util.get_current_placement_group()
@@ -138,4 +131,5 @@
 # if they cannot be provisioned. 
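`initialize_ray_cluster` funnels into Ray's placement-group API and, as the hunk below shows, blocks until the bundles are provisioned. A standalone sketch of the same Ray calls; the two-GPU bundle spec is an assumption for illustration, not copied from the elided hunk:

```python
import ray
from ray.util.placement_group import placement_group

ray.init(ignore_reinit_error=True)

# Reserve one GPU bundle per distributed worker (two assumed here).
pg = placement_group([{"GPU": 1}] * 2, strategy="PACK")

# Blocks until the cluster can provision the bundles; the engine code
# waits with an 1800 s timeout. Needs two visible GPUs to succeed.
ray.get(pg.ready(), timeout=1800)
```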
ray.get(current_placement_group.ready(), timeout=1800) - return current_placement_group + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 86b6c4c67cfa4..5130586e036b2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -82,6 +82,14 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument("--ssl-cert-reqs", + type=int, + default=0, + help="Whether client certificate is required") parser.add_argument( "--root-path", type=str, @@ -100,4 +108,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: log_level="debug", timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs) diff --git a/vllm/executor/__init__.py b/vllm/executor/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py new file mode 100644 index 0000000000000..30717e8a87358 --- /dev/null +++ b/vllm/executor/executor_base.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class ExecutorBase(ABC): + """Base class for all executors. + + An executor is responsible for executing the model on a specific device + type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor + that can execute the model on multiple devices. + """ + + @abstractmethod + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. 
If not, it should raise an + exception.""" + raise NotImplementedError + + +class ExecutorAsyncBase(ExecutorBase): + + @abstractmethod + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + async def check_health_async(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py new file mode 100644 index 0000000000000..9019ee7763c77 --- /dev/null +++ b/vllm/executor/gpu_executor.py @@ -0,0 +1,163 @@ +import importlib +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class GPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_model() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
+ """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self.driver_worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return + + +class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. 
+ return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py new file mode 100644 index 0000000000000..82a2b456895e8 --- /dev/null +++ b/vllm/executor/ray_gpu_executor.py @@ -0,0 +1,441 @@ +import asyncio +import copy +from collections import defaultdict +import os +import pickle +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port, + get_distributed_init_method, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # Set CUDA_VISIBLE_DEVICES for the driver and workers. + set_cuda_visible_devices(node_gpus[driver_node_id]) + for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): + worker.set_cuda_visible_devices.remote(node_gpus[node_id]) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + # FIXME(woosuk): We are not properly initializing cupy NCCL when + # we have multiple nodes. 
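The rank/GPU bookkeeping above is easy to sanity-check in isolation. A toy run of the same grouping logic with made-up node ids:

```python
from collections import defaultdict

# (node_id, gpu_ids) pairs, as returned by get_node_and_gpu_ids():
driver = ("node-a", [1])
workers = [("node-a", [0]), ("node-b", [3]), ("node-b", [2])]

node_workers = defaultdict(list)  # node -> global ranks on that node
node_gpus = defaultdict(list)     # node -> all GPU ids on that node

node_workers[driver[0]].append(0)  # driver is always rank 0
node_gpus[driver[0]].extend(driver[1])
for rank, (node_id, gpu_ids) in enumerate(workers, start=1):
    node_workers[node_id].append(rank)
    node_gpus[node_id].extend(gpu_ids)
for node_id, gpu_ids in node_gpus.items():
    node_gpus[node_id] = sorted(gpu_ids)

assert node_workers == {"node-a": [0, 1], "node-b": [2, 3]}
assert node_gpus == {"node-a": [0, 1], "node-b": [2, 3]}
assert node_workers["node-b"].index(3) == 1  # local rank of rank 3
```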
+ self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. 
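One caveat worth noting in `_compiled_ray_dag` above: `pkg_resources` version strings are compared lexicographically, so for example "2.10" sorts before "2.9" and would wrongly fail the gate. A sketch of the version-aware comparison with `packaging.version`, which this diff already imports in `vllm/config.py`:

```python
from packaging.version import Version

required, current = "2.9", "2.10"

assert (current < required) is True           # lexicographic: wrong
assert Version(current) >= Version(required)  # version-aware: correct
```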
+ for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py new file mode 100644 index 0000000000000..44976696a77c6 --- /dev/null +++ b/vllm/executor/utils.py @@ -0,0 +1,13 @@ +def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5a3a7b2dbaee7..3eb73ee109f50 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -47,16 +47,25 @@ class GeluAndMul(nn.Module): return: (batch_size, seq_len, d) or (num_tokens, d) """ + def __init__(self, approximate: str = "none"): + super().__init__() + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 - return F.gelu(x[..., :d]) * x[..., d:] + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) return out diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py deleted file mode 100644 index 2a82325b80213..0000000000000 --- a/vllm/model_executor/layers/attention.py +++ /dev/null @@ -1,349 +0,0 @@ -"""Multi-head attention.""" -from typing import List, Optional - -import importlib -import torch -import torch.nn as nn -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, - LowerTriangularMaskWithTensorBias) - -from vllm._C import ops -from vllm._C import cache_ops -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( - context_attention_fwd) -from vllm.utils 
import is_hip - -_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. -_PARTITION_SIZE = 512 - - -class PagedAttention(nn.Module): - """MHA/MQA/GQA layer with PagedAttention. - - This class takes query, key, and value tensors as input. The input tensors - can either contain prompt tokens or generation tokens. - The class does the following: - - 1. Reshape and store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention using either - xformers or the PagedAttention custom op. - 3. Return the output tensor. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, - sliding_window: Optional[int] = None, - ) -> None: - super().__init__() - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads - self.sliding_window = sliding_window - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) - - assert self.num_heads % self.num_kv_heads == 0 - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - if self.head_size not in _SUPPORTED_HEAD_SIZES: - raise ValueError(f"head_size ({self.head_size}) is not supported. " - f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") - - self.use_ref_attention = self.check_use_ref_attention() - - def check_use_ref_attention(self) -> bool: - if not is_hip(): - return False - # For ROCm, check whether flash attention is installed or not. - # if not, use_ref_attention needs to be True - return importlib.util.find_spec("flash_attn") is None - - def ref_masked_attention( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - ) -> torch.Tensor: - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - seq_len, _, _ = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - - attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query, - key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out - - def forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - key_cache: Optional[torch.Tensor], - value_cache: Optional[torch.Tensor], - input_metadata: InputMetadata, - ) -> torch.Tensor: - """PagedAttention forward pass. - - Args: - query: shape = [batch_size, seq_len, num_heads * head_size] - key: shape = [batch_size, seq_len, num_kv_heads * head_size] - value: shape = [batch_size, seq_len, num_kv_heads * head_size] - key_cache: shape = [num_blocks, num_kv_heads, head_size/x, - block_size, x] - value_cache: shape = [num_blocks, num_kv_heads, head_size, - block_size] - input_metadata: metadata for the inputs. - Returns: - shape = [batch_size, seq_len, num_heads * head_size] - """ - batch_size, seq_len, hidden_size = query.shape - # Reshape the query, key, and value tensors. 
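For the plain multi-head case, the `ref_masked_attention` fallback above is numerically equivalent to PyTorch's built-in scaled dot-product attention. A self-contained sanity sketch with illustrative shapes (not vLLM code; the forward pass of the deleted module continues below):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
seq_len, num_heads, head_size = 5, 2, 8
scale = head_size**-0.5
q, k, v = (torch.randn(seq_len, num_heads, head_size) for _ in range(3))

# Reference path: einsum plus an additive upper-triangular mask, as above.
mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
mask = mask * torch.finfo(torch.float32).min
weights = scale * torch.einsum("qhd,khd->hqk", q, k) + mask
ref = torch.einsum("hqk,khd->qhd", torch.softmax(weights, dim=-1), v)

# Built-in path: SDPA expects [heads, seq, dim] and applies the same
# causal mask and 1/sqrt(head_size) scale by default.
sdpa = F.scaled_dot_product_attention(
    q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1),
    is_causal=True).transpose(0, 1)

assert torch.allclose(ref, sdpa, atol=1e-5)
```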
- query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - # Reshape the keys and values and store them in the cache. - # If key_cache and value_cache are not provided, the new key and value - # vectors will not be cached. This happens during the initial memory - # profiling run. - if key_cache is not None and value_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - ) - - if input_metadata.is_prompt: - # normal attention - if (key_cache is None or value_cache is None - or input_metadata.block_tables.numel() == 0): - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. - if input_metadata.attn_bias is None: - if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - input_metadata.attn_bias = attn_bias - else: - input_metadata.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) - - if self.use_ref_attention: - output = self.ref_masked_attention( - query, - key, - value, - ) - # Using view got RuntimeError: view size is not compatible with input tensor's size and stride - # (at least one dimension spans across two contiguous subspaces). Use reshape instead - return output.reshape(batch_size, seq_len, hidden_size) - - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - else: - query = query.unflatten(0, (batch_size, seq_len)) - key = key.unflatten(0, (batch_size, seq_len)) - value = value.unflatten(0, (batch_size, seq_len)) - - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, - ) - output = out.view_as(query) - else: - # prefix-enabled attention - output = torch.empty_like(query) - context_attention_fwd( - query, - key, - value, - output, - key_cache, - value_cache, - input_metadata.block_tables, # [BS, max_block_per_request] - input_metadata.start_loc, - input_metadata.prompt_lens, - input_metadata.context_lens, - input_metadata.max_seq_len, - getattr(self, "alibi_slopes", None), - ) - - else: - # Decoding run. - output = _paged_attention( - query, - key_cache, - value_cache, - input_metadata, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - ) - - # Reshape the output tensor. 
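The MQA/GQA workaround in the prompt path above simply broadcasts each KV head across its group of query heads so that an MHA-only kernel can consume it. A shape-level sketch with illustrative sizes (the output reshape of the original forward pass continues below):

```python
import torch

num_tokens, num_heads, num_kv_heads, head_size = 6, 8, 2, 16
queries_per_kv = num_heads // num_kv_heads  # 4 query heads share each KV head

key = torch.randn(num_tokens, num_kv_heads, head_size)
# Insert a group axis and broadcast: [T, kv, d] -> [T, kv, group, d].
# expand() creates a view; nothing is copied until a kernel materializes it.
expanded = key[:, :, None, :].expand(num_tokens, num_kv_heads,
                                     queries_per_kv, head_size)
flat = expanded.reshape(num_tokens, num_heads, head_size)

# Query heads 0-3 all see KV head 0; heads 4-7 see KV head 1.
assert torch.equal(flat[:, 0], flat[:, 3])
assert torch.equal(flat[:, 0], key[:, 0])
```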
- return output.view(batch_size, seq_len, hidden_size) - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - batch_size: int, - seq_len: int, - dtype: torch.dtype, -) -> LowerTriangularMaskWithTensorBias: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(prompt_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - bias = bias[None, :] - bias[:, None] - - # When using custom attention bias, xformers requires the bias to - # be sliced from a tensor whose length is a multiple of 8. - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - batch_size, - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - attn_bias = LowerTriangularMaskWithTensorBias(bias) - return attn_bias - - -def _paged_attention( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - input_metadata: InputMetadata, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], -) -> torch.Tensor: - output = torch.empty_like(query) - - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ( - (input_metadata.max_context_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory shortage. - use_v1 = input_metadata.max_context_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: - # Run PagedAttention V1. - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - else: - # Run PagedAttention V2. 
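The V1-vs-V2 dispatch above can be restated as a standalone predicate (same constants as the deleted module; the function name is ours, and the V2 launch continues below):

```python
_PARTITION_SIZE = 512  # matches PARTITION_SIZE in paged_attention_v2


def use_paged_attention_v1(max_context_len: int, num_seqs: int,
                           num_heads: int) -> bool:
    # Prefer V1 when a single partition suffices or there is already enough
    # parallel work; fall back to V2 for long contexts to avoid the
    # shared-memory shortage noted above.
    max_num_partitions = ((max_context_len + _PARTITION_SIZE - 1) //
                          _PARTITION_SIZE)
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)


# Short contexts with one partition take V1; long contexts take V2.
assert use_paged_attention_v1(max_context_len=256, num_seqs=1, num_heads=8)
assert not use_paged_attention_v1(max_context_len=16384, num_seqs=1, num_heads=8)
```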
- assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - return output diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1391d43c8abeb..299ab44f8f3d5 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,9 @@ -from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_moe, + get_config_file_name, +) __all__ = [ "fused_moe", + "get_config_file_name", ] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..5c8185cfdeec1 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + 
"BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..97c9f4445b166 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json index 1fefb5ff7e42d..edf2a38d12ad3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -1,20 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..b2100cebb7f58 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 
128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..f578c8d0160ac --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 
4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json index 64d49ca66c1c8..e341a67917d51 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -1,24 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 
256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e6dd0dfe2eb3..1ec09f0cd4c28 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -245,6 +245,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +def get_config_file_name(E: int, N: int) -> str: + device_name = torch.cuda.get_device_name().replace(" ", "_") + return f"E={E},N={N},device_name={device_name}.json" + + @functools.lru_cache def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ @@ -258,11 +263,10 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # First look up if an optimized configuration is available in the configs # directory - device_name = torch.cuda.get_device_name().replace(" ", "_") + json_file_name = get_config_file_name(E, N) 
config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "configs", - f"E={E},N={N},device_name={device_name}.json") + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: logger.info( diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 8d26271bd60a5..131f1ea2208b2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -76,7 +76,7 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - if bias: + if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) return F.linear(x, weight, bias) @@ -338,9 +338,10 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # This is super hacky for now but we basically want to only compress once all - # of the shards are loaded, right now we just check if the number of shards - # loaded matches the number of outputs expected, assuming one shard per output + # This is super hacky for now but we basically want to only compress + # once all of the shards are loaded, right now we just check if the + # number of shards loaded matches the number of outputs expected, + # assuming one shard per output all_shards_loaded = (len(self.loaded_shards) == len(self.output_sizes)) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() @@ -489,9 +490,9 @@ def weight_loader(self, self.loaded_shards.add(loaded_shard_id) - # This is super hacky for now but we basically want to only compress once - # all of the shards are loaded, for the QKV matrix this means - # loading shards "q", "k" and "v" + # This is super hacky for now but we basically want to only + # compress once all of the shards are loaded, for the QKV matrix + # this means loading shards "q", "k" and "v" all_shards_loaded = (self.loaded_shards == set(["q", "k", "v"])) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index c05cdf56e27a4..6cb53db01d3f6 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,4 +1,5 @@ -from vllm.model_executor.layers.parameters.lazy_compressed import LazyCompressedParameter +from vllm.model_executor.layers.parameters.lazy_compressed import ( + LazyCompressedParameter) __all__ = [ "LazyCompressedParameter", diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 37128a6ed54b7..05d6bfb27008f 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -66,7 +66,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs): def unwrap(e): nonlocal ret_storage_format_cls if isinstance(e, LazyCompressedParameter): - assert ret_storage_format_cls is None or ret_storage_format_cls == e.storage_format_cls + assert (ret_storage_format_cls is None + or ret_storage_format_cls == e.storage_format_cls) ret_storage_format_cls = e.storage_format_cls if e.is_empty: @@ -86,7 +87,8 @@ def wrap(e): torch.Tensor) and ret_storage_format_cls is not None: return LazyCompressedParameter( e, - # Here, "e" is the output of "func" so it is real data and we store it 
+ # Here, "e" is the output of "func" so it is real + # data and we store it is_empty=False, storage_format_cls=ret_storage_format_cls) return e @@ -98,9 +100,10 @@ def compress(self) -> None: from magic_wand import SparseSemiStructuredStorageFormat if self.storage_format_cls == SparseSemiStructuredStorageFormat: - # Semi-structured sparsity assumes a 2:4 pattern, where each 4 elements - # have at minimum 2 zeros. We need to validate this pattern exists, so - # we check the whole tensor before committing to compression. + # Semi-structured sparsity assumes a 2:4 pattern, where + # each 4 elements have at minimum 2 zeros. We need to validate + # this pattern exists, so we check the whole tensor + # before committing to compression. # Count zeros in each group of 4 reshaped_tensor = self.uncompressed_data.view(-1, 4) @@ -112,8 +115,8 @@ def compress(self) -> None: if not has_semi_structured_sparsity: logger.warning( - f"Called compress() on tensor of shape {self.shape} but does not " - "have 2:4 sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape} but " + "does not have 2:4 sparsity, skipping compression") return else: @@ -123,8 +126,8 @@ def compress(self) -> None: # Only compress if we have sufficient sparsity (>=40%) if sparsity < 0.4: logger.warning( - f"Called compress() on tensor of shape {self.shape} but only has " - f"{sparsity:.2}% sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape}, but " + f"only has {sparsity:.2}% sparsity, skipping compression") return if self.uncompressed_data is None: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 13749570f28a2..71af9b26e2e93 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -22,7 +22,7 @@ # limitations under the License. """Rotary Positional Embeddings.""" import math -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -96,6 +96,7 @@ def _forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query = query.view(*query.shape[:-1], -1, self.head_size) @@ -107,7 +108,9 @@ def _forward( query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - cos_sin = self.cos_sin_cache[positions] + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) if self.is_neox_style: # NOTE(woosuk): Here we assume that the positions tensor has the @@ -137,11 +140,19 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - # ops.rotary_embedding() is an in-place operation that - # updates the query and key tensors. - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. 
+ if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key @@ -158,27 +169,32 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factor: float, + scaling_factors: Union[List[float], float], ) -> None: - self.scaling_factor = scaling_factor + if isinstance(scaling_factors, float): + scaling_factors = [scaling_factors] + self.scaling_factors = scaling_factors super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style) def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. - max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache + cache_list = [] + for scaling_factor in self.scaling_factors: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. + max_len = self.max_position_embeddings * scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + cache_list.append(cache) + return torch.cat(cache_list, dim=0) class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 874819f343373..df2ca0f1b773f 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -9,7 +9,8 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 + SemiStructuredSparseW16A16Config) _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py old mode 100644 new mode 100755 index 75c2ae1e9f48e..bc3b6a582d53d --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -62,8 +62,11 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } -# Models not supported by Neuron. -_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} +# Models supported by Neuron. 
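Stepping back to the rotary-embedding change earlier in this diff: the reworked `LinearScalingRotaryEmbedding` concatenates one cos/sin cache segment per scaling factor, and `offsets` selects a segment row at query time. A standalone sketch of that layout with toy sizes (the Neuron model table continues below):

```python
import torch

max_pos, rotary_dim, base = 8, 4, 10000
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2).float() / rotary_dim))

segments = []
for scaling_factor in (1.0, 2.0):
    # Each segment covers max_pos * factor positions, rescaled by the factor.
    t = torch.arange(max_pos * scaling_factor, dtype=torch.float) / scaling_factor
    freqs = torch.einsum("i,j->ij", t, inv_freq)
    segments.append(torch.cat((freqs.cos(), freqs.sin()), dim=-1))

cache = torch.cat(segments, dim=0)  # rows [0, 8): factor 1.0; [8, 24): factor 2.0

positions = torch.tensor([3])
offsets = torch.tensor([8])  # start row of the factor-2.0 segment
row = cache[positions + offsets]  # cos/sin for position 3 under 2x scaling
assert torch.allclose(row, segments[1][3:4])
```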
+_NEURON_SUPPORTED_MODELS = { + "LlamaForCausalLM": "neuron.llama", + "MistralForCausalLM": "neuron.mistral" +} class ModelRegistry: diff --git a/vllm/model_executor/models/neuron/mistral.py b/vllm/model_executor/models/neuron/mistral.py new file mode 100755 index 0000000000000..a302cce30abab --- /dev/null +++ b/vllm/model_executor/models/neuron/mistral.py @@ -0,0 +1,82 @@ +"""Inference-only Mistral model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MistralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput +import os + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MistralForCausalLM(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.lm_head = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> SamplerOutput: + with torch.inference_mode(): + seq_ids = [] + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.mistral.model import MistralForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers import MistralForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = MistralForCausalLM.from_pretrained( + model_name_or_path, low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = MistralForSampling.from_pretrained( + split_model_dir, **kwargs) + self.model.to_neuron() diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py deleted file mode 100644 index ab3e28389a04c..0000000000000 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import List, Dict -import copy - -import torch - -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.worker import Worker - - -class MultiStepWorker(Worker): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. 
- Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - @torch.inference_mode() - def execute_model_multi_step( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_steps: int, - ) -> List[SamplerOutput]: - """Run the model forward pass num_steps times. Returns the list of - sampler output, one per model forward pass. - """ - self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, - blocks_to_swap_out, blocks_to_copy) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - seq_group_metadata_list) - - # Assert enough KV space for num_steps tokens per sequence. - self._assert_enough_kv_space(seq_group_metadata_list, num_steps) - - # Run model num_steps times. - model_outputs = [] - for _ in range(num_steps): - model_output = super().execute_model( - seq_group_metadata_list=copied_seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob.logprob) - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - The multi-step worker must be able to append tokens to sequences after - a forward pass. This necessitates modification of the data structures - used by the worker. Since these data structures are shared with other - parts of vLLM, like the scheduler, we must take care not to introduce - unexpected side-effects. - - When Ray is used to orchestrate worker processes (such as when the - tensor-parallel degree is >1), this is not a problem because the input - datastructures will be serialized and created anew in the worker - process. - - However, when Ray is not used to orchestrate the worker processes (such - as when the tensor-parallel degree is 1), this is a problem. We avoid - the problem by shallow-copying the input datastructures (specifically, - the parts that will change in multiple steps). - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. 
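That copy discipline (shared container semantics everywhere except the fields that will be mutated) in a standalone form; `SeqData` here is a hypothetical stand-in, and the original implementation continues below:

```python
import copy


class SeqData:  # hypothetical stand-in for vLLM's per-sequence data
    def __init__(self, output_token_ids):
        self.output_token_ids = output_token_ids


original = SeqData(output_token_ids=[1, 2])
clone = copy.copy(original)  # shallow copy: attributes are shared...
clone.output_token_ids = original.output_token_ids[:]  # ...except this one

clone.output_token_ids.append(3)
assert original.output_token_ids == [1, 2]  # the caller's view is unchanged
assert clone.output_token_ids == [1, 2, 3]
```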
- new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.")
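For the record, the KV-space assertion in the removed `_assert_enough_kv_space` boils down to simple block arithmetic. A worked example with illustrative numbers:

```python
block_size = 16            # KV slots per physical block
num_blocks_allocated = 4   # blocks assigned to this sequence
current_seq_len = 60
num_steps = 5

# vLLM writes a token's KV in the iteration *after* the token is generated,
# so only final_seq_len - 1 slots must fit in the allocated blocks.
final_seq_len = current_seq_len + num_steps
required_kv_slots = final_seq_len - 1                    # 64
allocated_kv_slots = num_blocks_allocated * block_size   # 64
assert required_kv_slots <= allocated_kv_slots  # one more step would raise
```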