From d7f396486e3e9b4dd31020c81c6eb446593b586d Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Thu, 22 Feb 2024 04:18:37 +0200 Subject: [PATCH 001/113] Update comment (#2934) --- benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index cdcfb8582143c..ff5609c37febf 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -7,7 +7,7 @@ --disable-log-requests (TGI backend) - ./launch_hf_server.sh + ./launch_tgi_server.sh On the client side, run: python benchmarks/benchmark_serving.py \ From 5574081c49c9a5ac51662981aff80250119a97bd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 21 Feb 2024 21:24:01 -0500 Subject: [PATCH 002/113] Added early stopping to completion APIs (#2939) --- vllm/entrypoints/openai/protocol.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 727fec870293c..7c2aa707775ff 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -72,6 +72,7 @@ class ChatCompletionRequest(BaseModel): top_k: Optional[int] = -1 ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True @@ -99,6 +100,7 @@ def to_sampling_params(self) -> SamplingParams: top_k=self.top_k, ignore_eos=self.ignore_eos, use_beam_search=self.use_beam_search, + early_stopping=self.early_stopping, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, @@ -129,6 +131,7 @@ class CompletionRequest(BaseModel): top_k: Optional[int] = -1 ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True @@ -157,6 +160,7 @@ def to_sampling_params(self): max_tokens=self.max_tokens if not echo_without_generation else 1, logprobs=self.logprobs, use_beam_search=self.use_beam_search, + early_stopping=self.early_stopping, prompt_logprobs=self.logprobs if self.echo else None, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=(self.spaces_between_special_tokens), From 344020c926ad19d9d147f5ab6b8929669296edcb Mon Sep 17 00:00:00 2001 From: Roy Date: Thu, 22 Feb 2024 10:25:05 +0800 Subject: [PATCH 003/113] Migrate MistralForCausalLM to LlamaForCausalLM (#2868) --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/mistral.py | 377 ------------------------- 3 files changed, 6 insertions(+), 379 deletions(-) delete mode 100644 vllm/model_executor/models/mistral.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 17d8d69ba8672..411814f2f5d09 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -30,7 +30,7 @@ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), - "MistralForCausalLM": ("mistral", "MistralForCausalLM"), + "MistralForCausalLM": ("llama", 
"LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1d0353d7d396e..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -92,6 +92,7 @@ def __init__( max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, bias: bool = False, + sliding_window: Optional[int] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -141,7 +142,8 @@ def __init__( self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling, - num_kv_heads=self.num_kv_heads) + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -172,6 +174,7 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -182,6 +185,7 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, bias=getattr(config, "bias", False), + sliding_window=sliding_window, ) self.mlp = LlamaMLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py deleted file mode 100644 index 2347ed752d781..0000000000000 --- a/vllm/model_executor/models/mistral.py +++ /dev/null @@ -1,377 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only Mistral model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import MistralConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class MistralMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class MistralAttention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=self.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class MistralDecoderLayer(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MistralAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - linear_method=linear_method, - sliding_window=config.sliding_window) - self.mlp = MistralMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class MistralModel(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + 
lora_vocab - self.org_vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - MistralDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class MistralForCausalLM(nn.Module): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = MistralModel(config, - linear_method, - lora_config=lora_config) - unpadded_vocab_size = config.vocab_size - if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) From 95529e32537287831cddd800280a20d7c2417163 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 18:28:23 -0800 Subject: [PATCH 004/113] Use Llama RMSNorm custom op for Gemma (#2974) --- vllm/model_executor/models/gemma.py | 60 +++++++++++++---------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index affe54c448a2c..03bd149c001d3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -22,6 +22,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -40,21 +41,6 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] -class GemmaRMSNorm(nn.Module): - - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.zeros(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - output = self._norm(x.float()).type_as(x) - return output * (1 + self.weight) - - class GemmaMLP(nn.Module): def __init__( @@ -185,10 +171,10 @@ def __init__( intermediate_size=config.intermediate_size, linear_method=linear_method, ) - self.input_layernorm = GemmaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -196,25 +182,27 @@ def forward( hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, + residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, kv_cache=kv_cache, input_metadata=input_metadata, ) - hidden_states = residual + hidden_states # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return hidden_states + return hidden_states, residual class GemmaModel(nn.Module): @@ -235,7 +223,7 @@ def __init__( GemmaDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) - self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ 
-246,17 +234,19 @@ def forward( ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Normalize the embedding by sqrt(hidden_size) - hidden_states = hidden_states * (self.config.hidden_size**0.5) + hidden_states *= self.config.hidden_size**0.5 + residual = None for i in range(len(self.layers)): layer = self.layers[i] - hidden_states = layer( + hidden_states, residual = layer( positions, hidden_states, kv_caches[i], input_metadata, + residual, ) - hidden_states = self.norm(hidden_states) + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -321,6 +311,10 @@ def load_weights(self, # Skip loading extra layer for lora models. if "lm_head" in name: continue + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + if "norm.weight" in name: + loaded_weight += 1.0 param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -329,5 +323,5 @@ def load_weights(self, unloaded_params = params_dict.keys() - loaded_params if unloaded_params: raise RuntimeError( - f"Some weights are not initialized from checkpoints: {unloaded_params}" - ) + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") From 93dc5a287086299a124e9f1f6fac75458ae0acbd Mon Sep 17 00:00:00 2001 From: Massimiliano Pronesti Date: Thu, 22 Feb 2024 02:56:01 +0000 Subject: [PATCH 005/113] chore(vllm): codespell for spell checking (#2820) --- .github/workflows/ruff.yml | 5 +- benchmarks/benchmark_serving.py | 2 +- format.sh | 51 +++++++++++++++++-- mypy.ini | 8 --- pyproject.toml | 18 +++++++ requirements-dev.txt | 2 + tests/lora/test_layers.py | 2 +- tests/lora/test_llama.py | 4 +- vllm/core/block_manager.py | 2 +- vllm/core/scheduler.py | 2 +- vllm/lora/punica.py | 2 +- .../layers/triton_kernel/prefix_prefill.py | 2 +- vllm/model_executor/models/decilm.py | 2 +- .../parallel_utils/custom_all_reduce.py | 4 +- .../parallel_utils/parallel_state.py | 2 +- vllm/utils.py | 2 +- 16 files changed, 85 insertions(+), 25 deletions(-) delete mode 100644 mypy.ini diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index bd38d11872dc4..8f8f5ee3cc70c 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,7 +25,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 + pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 - name: Analysing the code with ruff run: | ruff vllm tests + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml \ No newline at end of file diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index ff5609c37febf..7d389a9c7d703 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -375,7 +375,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--disable-tqdm", action="store_true", - help="Specify to disbale tqdm progress bar.", + help="Specify to disable tqdm progress bar.", ) parser.add_argument( "--save-result", diff --git a/format.sh b/format.sh index c78108869659d..eb2c5ab031626 100755 --- a/format.sh +++ b/format.sh @@ -24,6 +24,7 @@ builtin cd "$ROOT" || exit 1 YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') +CODESPELL_VERSION=$(codespell --version) # # params: tool name, tool version, required version tool_version_check() { @@ -36,6 +37,7 @@ tool_version_check() { 
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" YAPF_FLAGS=( '--recursive' @@ -93,6 +95,47 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +# check spelling of specified files +spell_check() { + codespell "$@" +} + +spell_check_all(){ + codespell --toml pyproject.toml +} + +# Spelling check of files that differ from main branch. +spell_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell + fi +} + +# Run Codespell +## This flag runs spell check of individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + spell_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + spell_check_all +else + # Check spelling only of the files that changed in last commit. + spell_check_changed +fi +echo 'vLLM codespell: Done' + + # Lint specified files lint() { ruff "$@" @@ -117,9 +160,9 @@ lint_changed() { } # Run Ruff -echo 'vLLM Ruff:' -## This flag lints individual files. --files *must* be the first command line -## arg to use this option. +echo 'vLLM ruff:' +### This flag lints individual files. --files *must* be the first command line +### arg to use this option. if [[ "$1" == '--files' ]]; then lint "${@:2}" # If `--all` is passed, then any further arguments are ignored and the @@ -139,3 +182,5 @@ if ! git diff --quiet &>/dev/null; then exit 1 fi + + diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 55c4248ea9d26..0000000000000 --- a/mypy.ini +++ /dev/null @@ -1,8 +0,0 @@ -[mypy] -python_version = 3.8 - -ignore_missing_imports = True - -files = vllm -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ diff --git a/pyproject.toml b/pyproject.toml index b197256f6ff55..c5db016cebdb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,4 +31,22 @@ ignore = [ "E731", # line too long, handled by black formatting "E501", + # .strip() with multi-character strings + "B005", + # Loop control variable not used within loop body + "B007", ] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. 
+exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" + + +[tool.codespell] +ignore-words-list = "dout, te, indicies" +skip = "./tests/prompts" diff --git a/requirements-dev.txt b/requirements-dev.txt index f8126008d0794..b54a2773249cf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,9 @@ # formatting yapf==0.32.0 toml==0.10.2 +tomli==2.0.1 ruff==0.1.5 +codespell==2.2.6 # type checking mypy==0.991 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index f739bbeaab334..18ce300449dbf 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -279,7 +279,7 @@ def create_random_embedding_layer(): 256, org_num_embeddings=512) expanded_embedding.weight.data[:512, :] = embedding_data - # We need to deepcopy the embedding as it will be modifed + # We need to deepcopy the embedding as it will be modified # in place lora_embedding = VocabParallelEmbeddingWithLoRA( deepcopy(expanded_embedding)) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 06fbf19eea824..dfaf8c700695a 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -15,7 +15,7 @@ def do_sample(llm, lora_path: str, lora_id: int): "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -53,7 +53,7 @@ def test_llama_lora(sql_lora_files, tp_size): "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", ] expected_lora_output = [ " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7f91051f03ac1..3946096d4296a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -178,7 +178,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: if len(block_table) < len(logical_blocks): if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): - # re-use a block + # reuse a block block_table.append(block_table[len(block_table) % self.block_sliding_window]) else: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f4ac2d6dc59fe..5e7cc3091d775 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -158,7 +158,7 @@ def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) def _schedule(self) -> SchedulerOutputs: - # Blocks that need to be swaped or copied before model execution. + # Blocks that need to be swapped or copied before model execution. 
blocks_to_swap_in: Dict[int, int] = {} blocks_to_swap_out: Dict[int, int] = {} blocks_to_copy: Dict[int, List[int]] = {} diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 307a33dcf2820..fc74269e55876 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -87,7 +87,7 @@ def add_lora(y: torch.Tensor, r = wb_t_all.size(-1) if buffer is None: # We set the buffer to be float32 by default to avoid - # numerical innacuracies that would otherwise happen + # numerical inaccuracies that would otherwise happen # due to downcasting. buffer = torch.zeros((x.size(0), r), dtype=torch.float32, diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index ba40d42307fab..a1a2ab0c4805c 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -537,7 +537,7 @@ def _fwd_kernel_alibi( alibi_start_q = tl.arange( 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len alibi_start_k = cur_batch_ctx_len - # # init debuger + # # init debugger # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc # offset_db_k = tl.arange(0, BLOCK_N) # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 07aa4b72bf7a0..abf4a462871b0 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -41,7 +41,7 @@ class DeciLMForCausalLM(LlamaForCausalLM): Based on the llama executor. The main difference is that DeciLM uses Variable Grouped Query Attention. - The constant number of GQA heads in the decoder is overriden with a value + The constant number of GQA heads in the decoder is overridden with a value per layer. Usually, in the HuggingFace implementation, instead of diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py index ce4c8d02f7694..0c749c0484fc5 100644 --- a/vllm/model_executor/parallel_utils/custom_all_reduce.py +++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py @@ -36,14 +36,14 @@ def init_custom_ar() -> None: if world_size not in _SUPPORTED_WORLD_SIZES: logger.warn( "Custom allreduce is disabled due to an unsupported world size: " - "%d. Supported world sizes: %s. To slience this warning, specify" + "%d. Supported world sizes: %s. To silence this warning, specify" "disable_custom_all_reduce=True explicitly.", world_size, str(_SUPPORTED_WORLD_SIZES)) return if not _can_p2p(rank, world_size): logger.warn( "Custom allreduce is disabled because your platform lacks GPU P2P" - " capability. To slience this warning, specify" + " capability. 
To silence this warning, specify" "disable_custom_all_reduce=True explicitly.") return _CA_HANDLE = CustomAllreduce(rank, world_size) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index aeb07f64c37dc..c821936d06e4e 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -189,7 +189,7 @@ def get_pipeline_model_parallel_next_rank(): def get_pipeline_model_parallel_prev_rank(): - """Return the global rank that preceeds the caller in the pipeline""" + """Return the global rank that precedes the caller in the pipeline""" assert _PIPELINE_GLOBAL_RANKS is not None, ( "Pipeline parallel group is not initialized") rank_in_pipeline = get_pipeline_model_parallel_rank() diff --git a/vllm/utils.py b/vllm/utils.py index d7a3a3a2a9ef9..6206879929061 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -204,7 +204,7 @@ def _generate_random_fp8_e5m2( # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, # it may occur Inf or NaN if we directly use torch.randint # to generate random data for fp8 data. - # For example, s.11111.00 in fp8e5m2 format repesents Inf. + # For example, s.11111.00 in fp8e5m2 format represents Inf. # | E4M3 | E5M2 #-----|-------------|------------------- # Inf | N/A | s.11111.00 From fd5dcc5c816b7392821d3d4c02b13a7cf820d962 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 20:17:52 -0800 Subject: [PATCH 006/113] Optimize GeGLU layer in Gemma (#2975) --- csrc/activation_kernels.cu | 73 ++++++++++++++++-------- csrc/ops.h | 4 ++ csrc/pybind.cpp | 4 ++ tests/kernels/test_activation.py | 50 +++++----------- vllm/model_executor/layers/activation.py | 23 ++++++++ vllm/model_executor/models/gemma.py | 31 +++++----- 6 files changed, 108 insertions(+), 77 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 5ba9ab178d5a4..22b10f0571d1c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -2,19 +2,16 @@ #include #include +#include + #include "cuda_compat.h" #include "dispatch_utils.h" namespace vllm { -template -__device__ __forceinline__ T silu(const T& x) { - // x * sigmoid(x) - return (T) (((float) x) / (1.0f + expf((float) -x))); -} - -template -__global__ void silu_and_mul_kernel( +// Activation and gating kernel template. +template +__global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { @@ -22,32 +19,58 @@ __global__ void silu_and_mul_kernel( for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = silu(x) * y; + out[token_idx * d + idx] = ACT_FN(x) * y; } } +template +__device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) + return (T) (((float) x) / (1.0f + expf((float) -x))); +} + +template +__device__ __forceinline__ T gelu_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'none' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + const float f = (float) x; + constexpr float ALPHA = M_SQRT1_2; + return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); +} + } // namespace vllm +// Launch activation and gating kernel. 
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), \ + "act_and_mul_kernel", \ + [&] { \ + vllm::act_and_mul_kernel><<>>( \ + out.data_ptr(), \ + input.data_ptr(), \ + d); \ + }); + void silu_and_mul( torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - int64_t num_tokens = input.numel() / input.size(-1); - int d = input.size(-1) / 2; - - dim3 grid(num_tokens); - dim3 block(std::min(d, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "silu_and_mul_kernel", - [&] { - vllm::silu_and_mul_kernel<<>>( - out.data_ptr(), - input.data_ptr(), - d); - }); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); +} + +void gelu_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } namespace vllm { diff --git a/csrc/ops.h b/csrc/ops.h index 2bcd0c2efc5c6..dbdd2c2c57945 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -57,6 +57,10 @@ void silu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index b36d259697167..24c22020131e8 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -22,6 +22,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); + ops.def( + "gelu_and_mul", + &gelu_and_mul, + "Activation function used in GeGLU."); ops.def( "gelu_new", &gelu_new, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 8e216c293f070..e0dec144eba11 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,7 +1,10 @@ +from typing import Type + import pytest import torch -from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul +from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, + NewGELU, SiluAndMul) from allclose_default import get_default_atol, get_default_rtol DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -13,13 +16,15 @@ ] +@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_silu_and_mul( +def test_act_and_mul( + activation: Type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, @@ -31,48 +36,23 @@ def test_silu_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = SiluAndMul() + layer = activation() out = layer(x) ref_out = layer._forward(x) - assert torch.allclose(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) + # The SiLU and GELU implementations are equivalent to the native PyTorch + # implementations, so we can do exact comparison. 
+ assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) +@pytest.mark.parametrize("activation", [FastGELU, NewGELU]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_gelu_new( - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - torch.random.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.set_default_device(device) - x = torch.randn(num_tokens, d, dtype=dtype) - layer = NewGELU() - out = layer(x) - ref_out = layer._forward(x) - assert torch.allclose(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_gelu_fast( +def test_activation( + activation: Type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, @@ -84,7 +64,7 @@ def test_gelu_fast( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) - layer = FastGELU() + layer = activation() out = layer(x) ref_out = layer._forward(x) assert torch.allclose(out, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 95902ae38e256..5a3a7b2dbaee7 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.gelu(x[..., :d]) * x[..., d:] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + class NewGELU(nn.Module): def _forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03bd149c001d3..d8b515993d8ff 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -21,10 +21,11 @@ from transformers import GemmaConfig from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope @@ -50,27 +51,21 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.gate_proj = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=False, - linear_method=linear_method) - self.up_proj = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=False, - linear_method=linear_method) + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, linear_method=linear_method) - self.act_fn = nn.GELU() + self.act_fn = GeluAndMul() def forward(self, x): - gate, _ = self.gate_proj(x) - gate = self.act_fn(gate) - up, _ = self.up_proj(x) - fuse = gate * up - outputs, _ = self.down_proj(fuse) - return outputs + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x class GemmaAttention(nn.Module): @@ -294,6 +289,8 @@ def load_weights(self, ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) loaded_params = set() From c530e2cfe3b3d7e60130ff817cee7f3a395af232 Mon Sep 17 00:00:00 2001 From: 44670 <44670@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:40:05 +0800 Subject: [PATCH 007/113] [FIX] Fix a bug in initializing Yarn RoPE (#2983) --- vllm/model_executor/layers/rotary_embedding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 93ec5c12536fb..87068644112c0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -245,13 +245,11 @@ def _yarn_find_correction_range(low_rot: int, def _yarn_linear_ramp_mask(low: float, high: float, dim: int, - dtype: torch.dtype, - device: torch.device) -> torch.Tensor: + dtype: torch.dtype) -> torch.Tensor: if low == high: high += 0.001 # Prevent singularity - linear_func = 
(torch.arange(dim, dtype=dtype, device=device) - - low) / (high - low) + linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) ramp_func = torch.clamp(linear_func, 0, 1) return ramp_func From 6f32cddf1c795e74a47e84620462431154718f49 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 22 Feb 2024 09:58:29 -0800 Subject: [PATCH 008/113] Remove Flash Attention in test env (#2982) --- requirements-dev.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b54a2773249cf..80d66530f47f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,7 +17,6 @@ pytest-forked pytest-asyncio httpx einops # required for MPT -flash_attn # required for HuggingFace's llama implementation openai requests -ray \ No newline at end of file +ray From 4caf7044e052399f07089aa8f586d5bd641f7d53 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 23 Feb 2024 00:00:12 +0200 Subject: [PATCH 009/113] Include tokens from prompt phase in `counter_generation_tokens` (#2802) --- .buildkite/test-pipeline.yaml | 3 +++ tests/metrics/test_metrics.py | 34 +++++++++++++++++++++++++++++++++- vllm/engine/llm_engine.py | 3 +++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a91dcdfaf2ea5..efcc4d2d07a12 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,6 +52,9 @@ steps: - label: LoRA Test command: pytest -v -s lora +- label: Metrics Test + command: pytest -v -s metrics + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" commands: diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index da608a6a18f92..fe09aa8237f24 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -9,13 +9,16 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_metrics( +def test_metric_counter_prompt_tokens( vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, ) -> None: + # Reset metric + vllm.engine.metrics.counter_prompt_tokens.set_value({}, 0) + vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] @@ -31,3 +34,32 @@ def test_metrics( assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_metric_counter_generation_tokens( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # Reset metric + vllm.engine.metrics.counter_generation_tokens.set_value({}, 0) + + vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + metric_count = vllm.engine.metrics.counter_generation_tokens.get_value({}) + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. 
+ vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + + assert vllm_generation_count == metric_count, ( + f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" + ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0de40f54db61..81c9281c55416 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -872,6 +872,9 @@ def _get_stats(self, num_prompt_tokens = sum( len(seq_group.prompt_token_ids) for seq_group in scheduler_outputs.scheduled_seq_groups) + num_generation_tokens = sum( + seq_group.num_seqs() + for seq_group in scheduler_outputs.scheduled_seq_groups) else: num_generation_tokens = scheduler_outputs.num_batched_tokens From 57f044945f25d90d1b434014b2719ba6b06fdc44 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Fri, 23 Feb 2024 06:25:07 +0800 Subject: [PATCH 010/113] Fix nvcc not found in vlm-openai image (#2781) --- vllm/config.py | 2 +- vllm/utils.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 0b8a2a27f6d43..bd0dc89b585f7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -319,7 +319,7 @@ def _verify_cache_dtype(self) -> None: pass elif self.cache_dtype == "fp8_e5m2": nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version < Version("11.8"): + if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"): raise ValueError( "FP8 is not supported when cuda version is lower than 11.8." ) diff --git a/vllm/utils.py b/vllm/utils.py index 6206879929061..8ca95e148eb39 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -181,13 +181,18 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) -def get_nvcc_cuda_version() -> Version: +def get_nvcc_cuda_version() -> Optional[Version]: cuda_home = os.environ.get('CUDA_HOME') if not cuda_home: cuda_home = '/usr/local/cuda' - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + if os.path.isfile(cuda_home + '/bin/nvcc'): + logger.info( + f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' + ) + else: + logger.warning( + f'Not found nvcc in {cuda_home}. 
Skip cuda version check!') + return None nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() From f7c1234990793008f3d44790fd274040f26c4ee4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 23 Feb 2024 12:57:48 -0800 Subject: [PATCH 011/113] [Fix] Fissertion on YaRN model len (#2984) --- vllm/model_executor/layers/rotary_embedding.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 87068644112c0..13749570f28a2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -354,7 +354,6 @@ def get_rope( elif scaling_type == "yarn": original_max_position = rope_scaling[ "original_max_position_embeddings"] - assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v for k, v in rope_scaling.items() From ef978fe4111b0eb91c81eceba4d9791b94c7ffbf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 25 Feb 2024 19:54:00 +0000 Subject: [PATCH 012/113] Port metrics from `aioprometheus` to `prometheus_client` (#2730) --- docs/source/conf.py | 2 +- requirements-neuron.txt | 2 +- requirements-rocm.txt | 2 +- requirements.txt | 2 +- tests/conftest.py | 2 + tests/metrics/test_metrics.py | 25 ++-- vllm/engine/llm_engine.py | 3 +- vllm/engine/metrics.py | 170 ++++++++++++++++---------- vllm/entrypoints/openai/api_server.py | 12 +- 9 files changed, 133 insertions(+), 87 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index adbe67b21a0c8..5a45c6f9d1e0a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,7 @@ # Mock out external dependencies here. autodoc_mock_imports = [ - "torch", "transformers", "psutil", "aioprometheus", "sentencepiece", + "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", "vllm.cuda_utils", "vllm._C" ] diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 3f30ed08f037d..36e629add664d 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -6,4 +6,4 @@ neuronx-cc fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 42b89ae84aa45..e759ba7d028d9 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -10,4 +10,4 @@ transformers >= 4.38.0 # Required for Gemma. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client diff --git a/requirements.txt b/requirements.txt index de08bd29beaf9..de93ba6354cda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client pynvml == 11.5.0 triton >= 2.1.0 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
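
For orientation before the remaining file diffs of this patch: a minimal, self-contained sketch (not part of the diff) of the prometheus_client pattern the rest of the change switches to — metrics declared once with label names, updated via .labels(...), and read back per label set the way the updated tests do. Metric names, the "model_name" label, and the example model are taken from the files changed in this patch; anything else here is illustrative only.

    # Sketch of the labeled-metric pattern used by the new vllm/engine/metrics.py.
    from prometheus_client import Counter, Gauge, disable_created_metrics

    # Skip the extra *_created series, as the new metrics.py does.
    disable_created_metrics()

    # Names mirror definitions in vllm/engine/metrics.py.
    counter_prompt_tokens = Counter(
        "vllm:prompt_tokens_total",
        "Number of prefill tokens processed.",
        labelnames=["model_name"])
    gauge_gpu_cache_usage = Gauge(
        "vllm:gpu_cache_usage_perc",
        "GPU KV-cache usage. 1 means 100 percent usage.",
        labelnames=["model_name"])

    # The engine passes labels once (see the llm_engine.py hunk below) and
    # every update goes through .labels(...).
    labels = {"model_name": "facebook/opt-125m"}
    counter_prompt_tokens.labels(**labels).inc(128)
    gauge_gpu_cache_usage.labels(**labels).set(0.42)

    # The updated tests read a labeled counter back via the internal value:
    assert counter_prompt_tokens.labels(**labels)._value.get() == 128
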
diff --git a/tests/conftest.py b/tests/conftest.py index 6af9b36b6febe..30a3df89d9f12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -165,6 +165,7 @@ def __init__( dtype: str = "half", disable_log_stats: bool = True, tensor_parallel_size: int = 1, + **kwargs, ) -> None: self.model = LLM( model=model_name, @@ -174,6 +175,7 @@ def __init__( swap_space=0, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, + **kwargs, ) def generate( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index fe09aa8237f24..410bdfa5c69e2 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,5 +1,4 @@ import pytest -import vllm.engine.metrics MODELS = [ "facebook/opt-125m", @@ -16,10 +15,10 @@ def test_metric_counter_prompt_tokens( dtype: str, max_tokens: int, ) -> None: - # Reset metric - vllm.engine.metrics.counter_prompt_tokens.set_value({}, 0) - - vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_model = vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. @@ -29,7 +28,9 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - metric_count = vllm.engine.metrics.counter_prompt_tokens.get_value({}) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_prompt_tokens.labels( + **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" @@ -46,13 +47,15 @@ def test_metric_counter_generation_tokens( dtype: str, max_tokens: int, ) -> None: - # Reset metric - vllm.engine.metrics.counter_generation_tokens.set_value({}, 0) - - vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_model = vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) tokenizer = vllm_model.model.get_tokenizer() - metric_count = vllm.engine.metrics.counter_generation_tokens.get_value({}) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() vllm_generation_count = 0 for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 81c9281c55416..c1a75924c6d72 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -128,7 +128,8 @@ def __init__( # Metric Logging. 
if self.log_stats: self.stat_logger = StatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC) + local_interval=_LOCAL_LOGGING_INTERVAL_SEC, + labels=dict(model_name=model_config.model)) self.forward_dag = None if USE_RAY_COMPILED_DAG: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e613b9f551b2f..83e66a9372272 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,66 +1,94 @@ from vllm.logger import init_logger -from aioprometheus import Counter, Gauge, Histogram +from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics import time import numpy as np -from typing import List +from typing import Dict, List from dataclasses import dataclass logger = init_logger(__name__) -labels = {} - - -def add_global_metrics_labels(**kwargs): - labels.update(kwargs) - +disable_created_metrics() # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. + # begin-metrics-definitions -gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s", - "Average prefill throughput in tokens/s.") -gauge_avg_generation_throughput = Gauge( - "vllm:avg_generation_throughput_toks_per_s", - "Average generation throughput in tokens/s.") -counter_prompt_tokens = Counter("vllm:prompt_tokens_total", - "Number of prefill tokens processed.") -counter_generation_tokens = Counter("vllm:generation_tokens_total", - "Number of generation tokens processed.") - -gauge_scheduler_running = Gauge( - "vllm:num_requests_running", - "Number of requests currently running on GPU.") -gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", - "Number of requests swapped to CPU.") -gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", - "Number of requests waiting to be processed.") - -gauge_gpu_cache_usage = Gauge( - "vllm:gpu_cache_usage_perc", - "GPU KV-cache usage. 1 means 100 percent usage.") -gauge_cpu_cache_usage = Gauge( - "vllm:cpu_cache_usage_perc", - "CPU KV-cache usage. 
1 means 100 percent usage.") - -histogram_time_to_first_token = Histogram( - "vllm:time_to_first_token_seconds", - "Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, - 2.5, 5.0, 7.5, 10.0 - ]) -histogram_time_per_output_tokens = Histogram( - "vllm:time_per_output_token_seconds", - "Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5 - ]) -histogram_e2e_request_latency = Histogram( - "vllm:e2e_request_latency_seconds", - "Histogram of end to end request latency in seconds.", - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) +class Metrics: + + def __init__(self, labelnames: List[str]): + # Unregister any existing vLLM collectors + for collector in list(REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + REGISTRY.unregister(collector) + + # System stats + self.gauge_scheduler_running = Gauge( + name="vllm:num_requests_running", + documentation="Number of requests currently running on GPU.", + labelnames=labelnames) + self.gauge_scheduler_swapped = Gauge( + name="vllm:num_requests_swapped", + documentation="Number of requests swapped to CPU.", + labelnames=labelnames) + self.gauge_scheduler_waiting = Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames) + self.gauge_gpu_cache_usage = Gauge( + name="vllm:gpu_cache_usage_perc", + documentation="GPU KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + self.gauge_cpu_cache_usage = Gauge( + name="vllm:cpu_cache_usage_perc", + documentation="CPU KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + + # Raw stats from last model iteration + self.counter_prompt_tokens = Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + self.counter_generation_tokens = Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames) + self.histogram_time_to_first_token = Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + labelnames=labelnames, + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ]) + self.histogram_time_per_output_token = Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + labelnames=labelnames, + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5 + ]) + self.histogram_e2e_request_latency = Histogram( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of end to end request latency in seconds.", + labelnames=labelnames, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + + # Legacy metrics + self.gauge_avg_prompt_throughput = Gauge( + name="vllm:avg_prompt_throughput_toks_per_s", + documentation="Average prefill throughput in tokens/s.", + labelnames=labelnames, + ) + self.gauge_avg_generation_throughput = Gauge( + name="vllm:avg_generation_throughput_toks_per_s", + documentation="Average generation throughput in tokens/s.", + labelnames=labelnames, + ) + + # end-metrics-definitions @@ -87,7 +115,7 @@ class Stats: class StatLogger: """StatLogger is used LLMEngine to log to Promethus and Stdout.""" - def 
__init__(self, local_interval: float) -> None: + def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: # Metadata for logging locally. self.last_local_log = time.monotonic() self.local_interval = local_interval @@ -96,6 +124,10 @@ def __init__(self, local_interval: float) -> None: self.num_prompt_tokens: List[int] = [] self.num_generation_tokens: List[int] = [] + # Prometheus metrics + self.labels = labels + self.metrics = Metrics(labelnames=list(labels.keys())) + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: return float(np.sum(tracked_stats) / (now - self.last_local_log)) @@ -105,23 +137,33 @@ def _local_interval_elapsed(self, now: float) -> bool: def _log_prometheus(self, stats: Stats) -> None: # Set system stat gauges. - gauge_scheduler_running.set(labels, stats.num_running) - gauge_scheduler_swapped.set(labels, stats.num_swapped) - gauge_scheduler_waiting.set(labels, stats.num_waiting) - gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage) - gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage) + self.metrics.gauge_scheduler_running.labels(**self.labels).set( + stats.num_running) + self.metrics.gauge_scheduler_swapped.labels(**self.labels).set( + stats.num_swapped) + self.metrics.gauge_scheduler_waiting.labels(**self.labels).set( + stats.num_waiting) + self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set( + stats.gpu_cache_usage) + self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set( + stats.cpu_cache_usage) # Add to token counters. - counter_prompt_tokens.add(labels, stats.num_prompt_tokens) - counter_generation_tokens.add(labels, stats.num_generation_tokens) + self.metrics.counter_prompt_tokens.labels(**self.labels).inc( + stats.num_prompt_tokens) + self.metrics.counter_generation_tokens.labels(**self.labels).inc( + stats.num_generation_tokens) # Observe request level latencies in histograms. for ttft in stats.time_to_first_tokens: - histogram_time_to_first_token.observe(labels, ttft) + self.metrics.histogram_time_to_first_token.labels( + **self.labels).observe(ttft) for tpot in stats.time_per_output_tokens: - histogram_time_per_output_tokens.observe(labels, tpot) + self.metrics.histogram_time_per_output_token.labels( + **self.labels).observe(tpot) for e2e in stats.time_e2e_requests: - histogram_e2e_request_latency.observe(labels, e2e) + self.metrics.histogram_e2e_request_latency.labels( + **self.labels).observe(e2e) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: @@ -130,8 +172,10 @@ def _log_prometheus_interval(self, prompt_throughput: float, # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - gauge_avg_prompt_throughput.set(labels, prompt_throughput) - gauge_avg_generation_throughput.set(labels, generation_throughput) + self.metrics.gauge_avg_prompt_throughput.labels( + **self.labels).set(prompt_throughput) + self.metrics.gauge_avg_generation_throughput.labels( + **self.labels).set(generation_throughput) def log(self, stats: Stats) -> None: """Called by LLMEngine. 
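Since the counters, gauges and histograms now hang off a per-engine `Metrics` object rather than module-level globals, they are read back through the engine's `StatLogger`, which is exactly what the updated `tests/metrics/test_metrics.py` above does. A minimal sketch of that pattern (the model name and prompt are illustrative, and this assumes a machine that can actually load the model):

from vllm import LLM

# Keep stats enabled so llm_engine.stat_logger (and its Prometheus metrics) exists.
llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
llm.generate("Hello, my name is")

stat_logger = llm.llm_engine.stat_logger
# Every metric child is bound to the engine's labels, currently {"model_name": ...}.
prompt_tokens = stat_logger.metrics.counter_prompt_tokens.labels(
    **stat_logger.labels)._value.get()
print("prompt tokens counted so far:", prompt_tokens)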
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a217605452e3a..b2f040114a078 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -6,8 +6,7 @@ import importlib import inspect -from aioprometheus import MetricsMiddleware -from aioprometheus.asgi.starlette import metrics +from prometheus_client import make_asgi_app import fastapi import uvicorn from http import HTTPStatus @@ -18,7 +17,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import add_global_metrics_labels from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat @@ -141,8 +139,9 @@ def parse_args(): return parser.parse_args() -app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics -app.add_route("/metrics", metrics) # Exposes HTTP metrics +# Add prometheus asgi middleware to route /metrics requests +metrics_app = make_asgi_app() +app.mount("/metrics", metrics_app) @app.exception_handler(RequestValidationError) @@ -242,9 +241,6 @@ async def authentication(request: Request, call_next): openai_serving_completion = OpenAIServingCompletion( engine, served_model, args.lora_modules) - # Register labels for metrics - add_global_metrics_labels(model_name=engine_args.model) - app.root_path = args.root_path uvicorn.run(app, host=args.host, From 70f3e8e3a1ed081003c0a2b70de151bb144f98e0 Mon Sep 17 00:00:00 2001 From: Jared Moore <27744679+jlcmoore@users.noreply.github.com> Date: Sun, 25 Feb 2024 18:39:34 -0800 Subject: [PATCH 013/113] Add LogProbs for Chat Completions in OpenAI (#2918) --- tests/entrypoints/test_openai_server.py | 25 ++++++++-------- vllm/entrypoints/openai/protocol.py | 8 ++++++ vllm/entrypoints/openai/serving_chat.py | 38 +++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3a359502c39d5..29d0e6fd537d5 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -155,15 +155,18 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - ) + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10) assert chat_completion.id is not None assert chat_completion.choices is not None and len( chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None + assert chat_completion.choices[0].logprobs is not None + assert chat_completion.choices[0].logprobs.top_logprobs is not None + assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -198,13 +201,11 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, single_output = single_completion.choices[0].text single_usage = single_completion.usage - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - ) + stream = await client.completions.create(model=model_name, + 
prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].text) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 7c2aa707775ff..f57a2fb775783 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -63,6 +63,8 @@ class ChatCompletionRequest(BaseModel): seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + logprobs: Optional[bool] = False + top_logprobs: Optional[int] = None presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 logit_bias: Optional[Dict[str, float]] = None @@ -84,6 +86,8 @@ class ChatCompletionRequest(BaseModel): length_penalty: Optional[float] = 1.0 def to_sampling_params(self) -> SamplingParams: + if self.logprobs and not self.top_logprobs: + raise ValueError("Top logprobs must be set when logprobs is.") return SamplingParams( n=self.n, presence_penalty=self.presence_penalty, @@ -96,6 +100,8 @@ def to_sampling_params(self) -> SamplingParams: stop=self.stop, stop_token_ids=self.stop_token_ids, max_tokens=self.max_tokens, + logprobs=self.top_logprobs if self.logprobs else None, + prompt_logprobs=self.top_logprobs if self.echo else None, best_of=self.best_of, top_k=self.top_k, ignore_eos=self.ignore_eos, @@ -216,6 +222,7 @@ class ChatMessage(BaseModel): class ChatCompletionResponseChoice(BaseModel): index: int message: ChatMessage + logprobs: Optional[LogProbs] = None finish_reason: Optional[Literal["stop", "length"]] = None @@ -236,6 +243,7 @@ class DeltaMessage(BaseModel): class ChatCompletionResponseStreamChoice(BaseModel): index: int delta: DeltaMessage + logprobs: Optional[LogProbs] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 850797ae4b9b6..dd152583c2329 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -101,7 +101,10 @@ async def chat_completion_stream_generator( role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( - index=i, delta=DeltaMessage(role=role), finish_reason=None) + index=i, + delta=DeltaMessage(role=role), + logprobs=None, + finish_reason=None) chunk = ChatCompletionStreamResponse(id=request_id, object=chunk_object_type, created=created_time, @@ -118,6 +121,7 @@ async def chat_completion_stream_generator( "content") and request.messages[-1].get( "role") == role: last_msg_content = request.messages[-1]["content"] + if last_msg_content: for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -129,6 +133,7 @@ async def chat_completion_stream_generator( object=chunk_object_type, created=created_time, choices=[choice_data], + logprobs=None, model=model_name) data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -145,15 +150,29 @@ async def chat_completion_stream_generator( if finish_reason_sent[i]: continue + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + top_logprobs = output.logprobs[ + previous_num_tokens[i]:] if output.logprobs else None + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + delta_text = 
output.text[len(previous_texts[i]):] previous_texts[i] = output.text previous_num_tokens[i] = len(output.token_ids) - if output.finish_reason is None: # Send token-by-token response for each request.n choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), + logprobs=logprobs, finish_reason=None) chunk = ChatCompletionStreamResponse( id=request_id, @@ -174,6 +193,7 @@ async def chat_completion_stream_generator( choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), + logprobs=logprobs, finish_reason=output.finish_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -208,11 +228,25 @@ async def chat_completion_full_generator( assert final_res is not None choices = [] + role = self.get_chat_request_role(request) for output in final_res.outputs: + token_ids = output.token_ids + top_logprobs = output.logprobs + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + ) + else: + logprobs = None + choice_data = ChatCompletionResponseChoice( index=output.index, message=ChatMessage(role=role, content=output.text), + logprobs=logprobs, finish_reason=output.finish_reason, ) choices.append(choice_data) From cfc15a1031ef0197a1b291d2ed93717a9bdad268 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 26 Feb 2024 13:48:56 -0800 Subject: [PATCH 014/113] Optimize Triton MoE Kernel (#2979) Co-authored-by: Cade Daniel --- benchmarks/kernels/benchmark_mixtral_moe.py | 172 ++++++++++++++++++ setup.py | 4 +- .../layers/fused_moe/__init__.py | 5 + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 20 ++ ...168,device_name=NVIDIA_H100_80GB_HBM3.json | 24 +++ .../layers/fused_moe/configs/README | 10 + .../layers/{ => fused_moe}/fused_moe.py | 77 ++++++-- 7 files changed, 297 insertions(+), 15 deletions(-) create mode 100644 benchmarks/kernels/benchmark_mixtral_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/__init__.py create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/README rename vllm/model_executor/layers/{ => fused_moe}/fused_moe.py (85%) diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py new file mode 100644 index 0000000000000..9e08df76947f8 --- /dev/null +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -0,0 +1,172 @@ +import json +import os +import sys + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from vllm.model_executor.layers.fused_moe import fused_moe +import torch +import torch.nn.functional as F +import triton + + +def main(): + method = fused_moe + for bs in [ + 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + 2048, 3072, 4096 + ]: + run_grid(bs, method=method) + + +def run_grid(bs, method): + d_model = 4096 + num_total_experts = 8 + top_k = 2 + tp_size = 2 + model_intermediate_size = 14336 + num_layers = 32 + num_calls = 100 + + num_warmup_trials = 1 + num_trials = 1 + + configs = [] + if bs <= 16: + BLOCK_SIZES_M = [16] + elif bs <= 32: + BLOCK_SIZES_M = [16, 32] + elif bs <= 64: + BLOCK_SIZES_M = [16, 32, 64] + elif bs <= 128: + BLOCK_SIZES_M = [16, 32, 64, 128] + else: + BLOCK_SIZES_M = [16, 32, 64, 128, 256] + + for block_size_n in [32, 64, 128, 256]: + for block_size_m in 
BLOCK_SIZES_M: + for block_size_k in [64, 128, 256]: + for group_size_m in [1, 16, 32, 64]: + for num_warps in [4, 8]: + configs.append({ + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": 4, + }) + + best_config = None + best_time_us = 1e20 + + for config in configs: + print(f'{tp_size=} {bs=}') + print(f'{config}') + # warmup + print(f'warming up') + try: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + bs=bs, + d_model=d_model, + num_total_experts=num_total_experts, + top_k=top_k, + tp_size=tp_size, + model_intermediate_size=model_intermediate_size, + method=method, + config=config, + ) + except triton.runtime.autotuner.OutOfResources: + continue + + # trial + print(f'benchmarking') + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + bs=bs, + d_model=d_model, + num_total_experts=num_total_experts, + top_k=top_k, + tp_size=tp_size, + model_intermediate_size=model_intermediate_size, + method=method, + config=config, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + model_dur_ms = kernel_dur_ms * num_layers + + if kernel_dur_us < best_time_us: + best_config = config + best_time_us = kernel_dur_us + + print( + f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' + ) + + print("best_time_us", best_time_us) + print("best_config", best_config) + + filename = "/tmp/config.jsonl" + print(f"writing config to file {filename}") + with open(filename, "a") as f: + f.write(json.dumps({str(bs): best_config}) + "\n") + + +def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, + top_k: int, tp_size: int, model_intermediate_size: int, method, + config) -> float: + shard_intermediate_size = model_intermediate_size // tp_size + + hidden_states = torch.rand( + (bs, d_model), + device="cuda:0", + dtype=torch.bfloat16, + ) + + ws = torch.rand( + (num_total_experts, 2 * shard_intermediate_size, d_model), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + w2s = torch.rand( + (num_total_experts, d_model, shard_intermediate_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + gating_output = F.softmax(torch.rand( + (num_calls, bs, num_total_experts), + device=hidden_states.device, + dtype=torch.float32, + ), + dim=-1) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + for i in range(num_calls): + hidden_states = method( + hidden_states=hidden_states, + w1=ws, + w2=w2s, + gating_output=gating_output[i], + topk=2, + renormalize=True, + inplace=True, + override_config=config, + ) + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/setup.py b/setup.py index 8fcb86394f76d..16978d74e0425 100644 --- a/setup.py +++ b/setup.py @@ -432,7 +432,9 @@ def get_requirements() -> List[str]: return requirements -package_data = {"vllm": ["py.typed"]} +package_data = { + "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] +} if os.environ.get("VLLM_USE_PRECOMPILED"): ext_modules = [] package_data["vllm"].append("*.so") diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py new file mode 100644 index 
0000000000000..1391d43c8abeb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe + +__all__ = [ + "fused_moe", +] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..1fefb5ff7e42d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,20 @@ +{ + "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, + "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, + "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, + "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, + "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..64d49ca66c1c8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,24 @@ +{ + "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, + "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, + "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, + "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, + "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, + "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} +} diff --git a/vllm/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README new file mode 100644 index 0000000000000..45d40cbfb1a2e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/README @@ -0,0 +1,10 @@ +This directory contains tuned configurations for different settings of the fused_moe kernel. +For different settings of +- E (number of experts) +- N (intermediate size) +- device_name (torch.cuda.get_device_name()) +the JSON file contains a mapping from M (batch size) to the chosen configuration. + +The example configurations provided are for the Mixtral model for TP2 on H100 +and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have +N = 7168 and for TP4 we have N = 3584. 
diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py similarity index 85% rename from vllm/model_executor/layers/fused_moe.py rename to vllm/model_executor/layers/fused_moe/fused_moe.py index bc3aef1887ef8..830fde6c4eb6d 100644 --- a/vllm/model_executor/layers/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,11 +1,19 @@ """Fused MoE kernel.""" +import functools +import json +import os +from typing import Any, Dict, Optional + import torch import triton import triton.language as tl from vllm._C import ops +from vllm.logger import init_logger from vllm.utils import is_hip +logger = init_logger(__name__) + @triton.jit def fused_moe_kernel( @@ -210,6 +218,34 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +@functools.lru_cache +def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of batch sizes + to configurations of the fused_moe kernel. To evaluate the kernel on a given batch + size bs, the closest batch size in the grid should be picked and the associated + configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs directory + device_name = torch.cuda.get_device_name().replace(" ", "_") + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", + f"E={E},N={N},device_name={device_name}.json") + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + f"Using configuration from {config_file_path} for MoE layer.") + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default configuration + return None + + def fused_moe( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -218,6 +254,7 @@ def fused_moe( topk: int, renormalize: bool, inplace: bool = False, + override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. @@ -230,6 +267,7 @@ def fused_moe( - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - inplace (bool): If True, perform the operation in-place. Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. Returns: - torch.Tensor: The output tensor after applying the MoE layer. 
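Taken together with the configs README above, the lookup documented in `get_moe_configs()` works in two steps: the JSON file is chosen by (E, N, device name), and within that file the entry whose batch size is closest to the current M is used, as the next hunk implements. In outline (a sketch with illustrative values for E, N and M, assuming the configs directory is readable and a CUDA device is visible):

import json
import os

import torch

E, N, M = 8, 7168, 600  # num experts, shard intermediate size, batch size (illustrative)
device_name = torch.cuda.get_device_name().replace(" ", "_")
config_file = os.path.join("configs", f"E={E},N={N},device_name={device_name}.json")

with open(config_file) as f:
    # Keys form an irregular grid of batch sizes; values are kernel configurations.
    configs = {int(key): val for key, val in json.load(f).items()}

# Pick the configuration tuned for the batch size closest to M.
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(config)  # one of the entries from the matching JSON file

In the patch itself this lookup sits behind `functools.lru_cache`, so the JSON file is only read once per (E, N) pair.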
@@ -279,20 +317,31 @@ def fused_moe( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - - if topk_ids.numel() <= w1.shape[0]: - config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 - } + if override_config: + config = override_config + else: + # First try to load optimal config from the file + configs = get_moe_configs(E, w2.shape[2]) + + if configs: + # If an optimal configuration map has been found, look up the optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'GROUP_SIZE_M': 8 + } + + if M <= E: + config = { + 'BLOCK_SIZE_M': 16, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'GROUP_SIZE_M': 1 + } intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), device=hidden_states.device, From d6e4a130b028f42a7f413d99eb91a4395fa7a04a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 15:00:54 -0800 Subject: [PATCH 015/113] [Minor] Remove gather_cached_kv kernel (#3043) --- csrc/cache.h | 7 -- csrc/cache_kernels.cu | 161 ------------------------------------------ csrc/pybind.cpp | 4 -- 3 files changed, 172 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 21c71830f7942..765e231abd26f 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -23,13 +23,6 @@ void reshape_and_cache( torch::Tensor& slot_mapping, const std::string& kv_cache_dtype); -void gather_cached_kv( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping); - // Just for unittest void convert_fp8_e5m2( torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index ceb7347d94670..7254010b8e3a9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -269,167 +269,6 @@ void reshape_and_cache( namespace vllm { -// Grid: (num_blocks, block_size). 
-template -__global__ void gather_cached_kv_kernel( - scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) { - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int num_tokens = num_heads * head_size; - for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { - const int tgt_key_idx = token_idx * key_stride + i; - const int tgt_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); - value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); - } -} - -template -__global__ void gather_cached_kv_kernel_optimized( - scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int *__restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) -{ - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int dim = num_heads * head_size; - assert(dim % 4 == 0); // this is true for known use cases - const int unroll_factor = 4; - const int unrolled_dim = dim / unroll_factor; - - for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x) - { - int tgt_key_indices[unroll_factor]; - int tgt_value_indices[unroll_factor]; - int src_key_indices[unroll_factor]; - int src_value_indices[unroll_factor]; - scalar_t keys_to_store[unroll_factor]; - scalar_t values_to_store[unroll_factor]; - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - int index = i + j * unrolled_dim; - - const int tgt_key_idx = token_idx * key_stride + index; - const int tgt_value_idx = token_idx * value_stride + index; - - const int head_idx = index / head_size; - const int head_offset = index % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - 
const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - tgt_key_indices[j] = tgt_key_idx; - tgt_value_indices[j] = tgt_value_idx; - src_key_indices[j] = src_key_idx; - src_value_indices[j] = src_value_idx; - - keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); - values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); - } - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - key[tgt_key_indices[j]] = keys_to_store[j]; - value[tgt_value_indices[j]] = values_to_store[j]; - } - } -} - -} // namespace vllm - -void gather_cached_kv( - torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& value, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [in] [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [in] [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping) // [in] [num_tokens] -{ - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key.scalar_type(), - "gather_cached_kv_kernel_optimized", - [&] { - vllm::gather_cached_kv_kernel_optimized<<>>( - key.data_ptr(), - value.data_ptr(), - key_cache.data_ptr(), - value_cache.data_ptr(), - slot_mapping.data_ptr(), - key_stride, - value_stride, - num_heads, - head_size, - block_size, - x); - }); -} - -namespace vllm { - template __global__ void convert_fp8_e5m2_kernel( const Tin* __restrict__ src_cache, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 24c22020131e8..5d062bb5700bc 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -79,10 +79,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "reshape_and_cache", &reshape_and_cache, "Reshape the key and value tensors and cache them"); - cache_ops.def( - "gather_cached_kv", - &gather_cached_kv, - "Gather key and value from the cache into contiguous QKV tensors"); cache_ops.def( "convert_fp8_e5m2", &convert_fp8_e5m2, From d9f726c4d0920e705069c005fb3b1042368961ae Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 09:25:22 +0800 Subject: [PATCH 016/113] [Minor] Remove unused config files (#3039) --- vllm/model_executor/models/baichuan.py | 6 +- vllm/model_executor/models/olmo.py | 4 +- vllm/model_executor/models/qwen.py | 8 +-- vllm/transformers_utils/config.py | 2 - vllm/transformers_utils/configs/__init__.py | 6 -- vllm/transformers_utils/configs/baichuan.py | 62 ------------------ vllm/transformers_utils/configs/olmo.py | 72 --------------------- vllm/transformers_utils/configs/qwen.py | 60 ----------------- 8 files changed, 10 insertions(+), 210 deletions(-) delete mode 100644 vllm/transformers_utils/configs/baichuan.py delete mode 100644 vllm/transformers_utils/configs/olmo.py delete mode 100644 vllm/transformers_utils/configs/qwen.py diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f08c3c8d257ff..550dec6487f9e 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -23,6 +23,7 @@ import torch from torch import nn +from transformers import 
PretrainedConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul @@ -42,7 +43,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -186,7 +186,7 @@ def forward( class BaiChuanDecoderLayer(nn.Module): def __init__(self, - config: BaiChuanConfig, + config: PretrainedConfig, position_embedding: str, linear_method: Optional[LinearMethodBase] = None): super().__init__() @@ -245,7 +245,7 @@ def forward( class BaiChuanModel(nn.Module): def __init__(self, - config: BaiChuanConfig, + config: PretrainedConfig, position_embedding: str, linear_method: Optional[LinearMethodBase] = None): super().__init__() diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 2eb42935e8bfd..9d563039208c8 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -61,7 +61,9 @@ hf_model_weights_iterator, ) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.olmo import OLMoConfig + +# this model must need this dependency +from hf_olmo import OLMoConfig KVCache = Tuple[torch.Tensor, torch.Tensor] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index fbc7320fb45a4..37af84c7cd53f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -8,6 +8,7 @@ import torch from torch import nn +from transformers import PretrainedConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul @@ -27,7 +28,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.qwen import QWenConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -127,7 +127,7 @@ class QWenBlock(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -179,7 +179,7 @@ class QWenModel(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -222,7 +222,7 @@ class QWenLMHeadModel(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 491cb4d9a427c..6b0413f440a0e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,10 +5,8 @@ from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { - "baichuan": BaiChuanConfig, "chatglm": ChatGLMConfig, "mpt": MPTConfig, - "qwen": QWenConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 47bcc2b9594be..ef955f75cedaa 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,18 +1,12 @@ -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from 
vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.olmo import OLMoConfig -from vllm.transformers_utils.configs.qwen import QWenConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig __all__ = [ - "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", - "OLMoConfig", - "QWenConfig", "RWConfig", ] diff --git a/vllm/transformers_utils/configs/baichuan.py b/vllm/transformers_utils/configs/baichuan.py deleted file mode 100644 index 869817525c11a..0000000000000 --- a/vllm/transformers_utils/configs/baichuan.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers.configuration_utils import PretrainedConfig - - -class BaiChuanConfig(PretrainedConfig): - model_type = "baichuan" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/olmo.py b/vllm/transformers_utils/configs/olmo.py deleted file mode 100644 index a9dfc6ec88ca6..0000000000000 --- a/vllm/transformers_utils/configs/olmo.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding=utf-8 -# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py -"""OLMo configuration""" -from transformers import PretrainedConfig - - -class OLMoConfig(PretrainedConfig): - model_type = 'olmo' - attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - } - - # Note that the defaults for these attributes are equivalent to the base GPT2 model. 
- def __init__( - self, - d_model=768, - n_heads=12, - n_layers=12, - mlp_ratio=4, - mlp_hidden_size=None, - activation_type="swiglu", - block_type="sequential", - block_group_size=1, - alibi=False, - alibi_bias_max=8.0, - rope=False, - rope_full_precision=True, - multi_query_attention=False, - attention_layer_norm=False, - layer_norm_type="default", - layer_norm_with_affine=True, - attention_layer_norm_with_affine=True, - max_sequence_length=1024, - include_bias=True, - bias_for_layer_norm=None, - scale_logits=False, - vocab_size=50257, - embedding_size=50304, - weight_tying=True, - eos_token_id=50256, - pad_token_id=50256, - **kwargs, - ): - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.mlp_ratio = mlp_ratio - self.mlp_hidden_size = mlp_hidden_size - self.activation_type = activation_type - self.block_type = block_type - self.block_group_size = block_group_size - self.alibi = alibi - self.alibi_bias_max = alibi_bias_max - self.rope = rope - self.rope_full_precision = rope_full_precision - self.multi_query_attention = multi_query_attention - self.attention_layer_norm = attention_layer_norm - self.layer_norm_type = layer_norm_type - self.layer_norm_with_affine = layer_norm_with_affine - self.attention_layer_norm_with_affine = attention_layer_norm_with_affine - self.max_sequence_length = max_sequence_length - self.include_bias = include_bias - self.bias_for_layer_norm = bias_for_layer_norm - self.scale_logits = scale_logits - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.weight_tying = weight_tying - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/qwen.py b/vllm/transformers_utils/configs/qwen.py deleted file mode 100644 index bb033a337ad04..0000000000000 --- a/vllm/transformers_utils/configs/qwen.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Alibaba Cloud. 
-# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE - -from transformers import PretrainedConfig - - -class QWenConfig(PretrainedConfig): - model_type = "qwen" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - num_hidden_layers=32, - num_attention_heads=32, - emb_dropout_prob=0.0, - attn_dropout_prob=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - max_position_embeddings=8192, - scale_attn_weights=True, - use_cache=True, - bf16=False, - fp16=False, - fp32=False, - kv_channels=128, - rotary_pct=1.0, - rotary_emb_base=10000, - use_dynamic_ntk=True, - use_logn_attn=True, - use_flash_attn="auto", - intermediate_size=22016, - no_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.emb_dropout_prob = emb_dropout_prob - self.attn_dropout_prob = attn_dropout_prob - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.bf16 = bf16 - self.fp16 = fp16 - self.fp32 = fp32 - self.kv_channels = kv_channels - self.rotary_pct = rotary_pct - self.rotary_emb_base = rotary_emb_base - self.use_dynamic_ntk = use_dynamic_ntk - self.use_logn_attn = use_logn_attn - self.use_flash_attn = use_flash_attn - self.no_bias = no_bias - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) From c1c0d00b88320f97e00a3175fac235a232893da5 Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 09:33:38 +0800 Subject: [PATCH 017/113] Don't use cupy when `enforce_eager=True` (#3037) --- vllm/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c1a75924c6d72..f5b2145c22d6f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -284,7 +284,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", is_driver_worker=True, ) - self._run_workers("init_model", cupy_port=get_open_port()) + # don't use cupy for eager mode + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) self._run_workers( "load_model", max_concurrent_workers=self.parallel_config. From 4dd6416faf7cc3035ac3f5c8375eb27e6b0eee80 Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 10:31:10 +0800 Subject: [PATCH 018/113] Fix stablelm (#3038) --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/stablelm.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 411814f2f5d09..40b375bb6fbea 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -43,6 +43,7 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), + "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), } # Models not supported by ROCm. 
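The registry entry added above covers checkpoints whose config reports the newer `StableLmForCausalLM` architecture name. The `stablelm.py` hunks below likewise read each config value under both the legacy attribute name (`rope_pct`, `norm_eps`) and the newer one (`partial_rotary_factor`, `layer_norm_eps`), so either config flavour works. The fallback in isolation (a sketch; the checkpoint name is illustrative):

from transformers import AutoConfig

# Either a legacy StableLM-Epoch config or a newer StableLm config will do here.
config = AutoConfig.from_pretrained("stabilityai/stablelm-3b-4e1t",
                                    trust_remote_code=True)
rope_pct = getattr(config, "rope_pct",
                   getattr(config, "partial_rotary_factor", 1))
norm_eps = getattr(config, "norm_eps",
                   getattr(config, "layer_norm_eps", 1e-05))
print(rope_pct, norm_eps)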
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 95e5ad8ede63e..44c57e5a6d4f9 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -94,7 +94,9 @@ def __init__(self, 1, self.total_num_key_value_heads // tp_size) self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) + rope_pct = getattr(config, "rope_pct", + getattr(config, "partial_rotary_factor", 1)) + self.rotary_ndims = int(self.head_dim * rope_pct) self.scaling = self.head_dim**-0.5 self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_key_value_heads * self.head_dim @@ -114,7 +116,6 @@ def __init__(self, self.hidden_size, bias=False, linear_method=linear_method) - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_ndims, @@ -152,10 +153,11 @@ def __init__( super().__init__() self.self_attn = StablelmAttention(config) self.mlp = StablelmMLP(config, linear_method) - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) + eps=norm_eps) def forward( self, @@ -199,7 +201,9 @@ def __init__(self, StablelmDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) - self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps) def forward( self, From 48a8f4a7fd18d516ffc0a304219ef722613ea792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=A4=A7=E6=88=90?= <1345739055@qq.com> Date: Tue, 27 Feb 2024 11:17:06 +0800 Subject: [PATCH 019/113] Support Orion model (#2539) Co-authored-by: zhangdacheng Co-authored-by: Woosuk Kwon --- README.md | 1 + docs/source/models/supported_models.rst | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/orion.py | 322 ++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 vllm/model_executor/models/orion.py diff --git a/README.md b/README.md index 7a16bb1fef044..f771788db2b89 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) - OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.) - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) +- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.) - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c1639ca9e056a..35b548d2737ce 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -71,6 +71,9 @@ Alongside each architecture, we include some popular models that use it. * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 
+ * - :code:`OrionForCausalLM` + - Orion + - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 40b375bb6fbea..66d28207d664f 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -38,6 +38,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), + "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py new file mode 100644 index 0000000000000..0b067d4fc8802 --- /dev/null +++ b/vllm/model_executor/models/orion.py @@ -0,0 +1,322 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py +# Copyright (c) OrionStar Inc. +# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE +"""Inference-only Orion-14B model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class OrionMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class OrionAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class OrionDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = OrionAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = OrionMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = 
nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states, None + + +class OrionModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + OrionDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class OrionForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = OrionModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. 
+ continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 2410e320b35cd704059b7c6ba8d8ba7643fe46ee Mon Sep 17 00:00:00 2001 From: Jingru Date: Tue, 27 Feb 2024 11:22:16 +0800 Subject: [PATCH 020/113] fix `get_ip` error in pure ipv6 environment (#2931) --- vllm/utils.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 8ca95e148eb39..c8ac57de6f5f5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -162,9 +162,16 @@ def _async_wrapper(*args, **kwargs) -> asyncio.Future: def get_ip() -> str: + # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable - return s.getsockname()[0] + try: + s.connect(("dns.google", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except OSError: + # try ipv6 + s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + s.connect(("dns.google", 80)) + return s.getsockname()[0] def get_distributed_init_method(ip: str, port: int) -> str: @@ -172,9 +179,16 @@ def get_distributed_init_method(ip: str, port: int) -> str: def get_open_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] + # try ipv4 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + except OSError: + # try ipv6 + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def set_cuda_visible_devices(device_ids: List[int]) -> None: From 4bd18ec0c719d2910040e22fa60503fdbfce1332 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 19:44:29 -0800 Subject: [PATCH 021/113] [Minor] Fix type annotation in fused moe (#3045) --- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 830fde6c4eb6d..08e3c2d5b706e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -2,7 +2,7 @@ import functools import json import os -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import torch import triton @@ -137,7 +137,7 @@ def fused_moe_kernel( def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, - num_experts: int) -> (torch.Tensor, torch.Tensor, torch.Tensor): + num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block size for matrix multiplication. 
@@ -185,7 +185,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, - mul_routed_weight: bool, top_k: int, config: dict): + mul_routed_weight: bool, top_k: int, + config: Dict[str, Any]) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 From e0ade06d6305cf84b41c1962cdd9dfdbfee16ac9 Mon Sep 17 00:00:00 2001 From: Dylan Hawk <51147702+dylanwhawk@users.noreply.github.com> Date: Mon, 26 Feb 2024 19:51:53 -0800 Subject: [PATCH 022/113] Support logit bias for OpenAI API (#3027) --- tests/entrypoints/test_openai_server.py | 48 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 33 +++++++++++++ vllm/entrypoints/openai/serving_chat.py | 8 +--- vllm/entrypoints/openai/serving_completion.py | 6 +-- 4 files changed, 83 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 29d0e6fd537d5..72e2374899793 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -9,6 +9,8 @@ import openai # use the official client for correctness check from huggingface_hub import snapshot_download # downloading lora to test lora requests +from vllm.transformers_utils.tokenizer import get_tokenizer + MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here @@ -310,5 +312,51 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, assert texts[0] == texts[1] +async def test_logits_bias(server, client: openai.AsyncOpenAI): + prompt = "Hello, my name is" + max_tokens = 5 + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + # Test exclusive selection + token_id = 1000 + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token_id): 100}, + ) + assert completion.choices[0].text is not None and len( + completion.choices[0].text) >= 5 + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), + add_special_tokens=False)["input_ids"] + assert all([ + response == expected + for response, expected in zip(response_tokens, expected_tokens) + ]) + + # Test ban + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + ) + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + first_response = completion.choices[0].text + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token): -100 + for token in response_tokens}, + ) + assert first_response != completion.choices[0].text + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f57a2fb775783..e85e7e2b1ede9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -8,6 +8,8 @@ from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams +import torch + class ErrorResponse(BaseModel): object: str 
= "error" @@ -88,6 +90,21 @@ class ChatCompletionRequest(BaseModel): def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: raise ValueError("Top logprobs must be set when logprobs is.") + + logits_processors = None + if self.logit_bias: + + def logit_bias_logits_processor( + token_ids: List[int], + logits: torch.Tensor) -> torch.Tensor: + for token_id, bias in self.logit_bias.items(): + # Clamp the bias between -100 and 100 per OpenAI API spec + bias = min(100, max(-100, bias)) + logits[int(token_id)] += bias + return logits + + logits_processors = [logit_bias_logits_processor] + return SamplingParams( n=self.n, presence_penalty=self.presence_penalty, @@ -111,6 +128,7 @@ def to_sampling_params(self) -> SamplingParams: spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, + logits_processors=logits_processors, ) @@ -149,6 +167,20 @@ class CompletionRequest(BaseModel): def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 + logits_processors = None + if self.logit_bias: + + def logit_bias_logits_processor( + token_ids: List[int], + logits: torch.Tensor) -> torch.Tensor: + for token_id, bias in self.logit_bias.items(): + # Clamp the bias between -100 and 100 per OpenAI API spec + bias = min(100, max(-100, bias)) + logits[int(token_id)] += bias + return logits + + logits_processors = [logit_bias_logits_processor] + return SamplingParams( n=self.n, best_of=self.best_of, @@ -172,6 +204,7 @@ def to_sampling_params(self): spaces_between_special_tokens=(self.spaces_between_special_tokens), include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, + logits_processors=logits_processors, ) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index dd152583c2329..5635ac6c9e106 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -39,19 +39,13 @@ async def create_chat_completion( See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI ChatCompletion API. - NOTE: Currently we do not support the following features: + NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) - - logit_bias (to be supported by vLLM engine) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret - if request.logit_bias is not None and len(request.logit_bias) > 0: - # TODO: support logit_bias in vLLM engine. - return self.create_error_response( - "logit_bias is not currently supported") - try: prompt = self.tokenizer.apply_chat_template( conversation=request.messages, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 667b659f81e9e..610f53549da48 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,10 +264,9 @@ async def create_completion(self, request: CompletionRequest, See https://platform.openai.com/docs/api-reference/completions/create for the API specification. This API mimics the OpenAI Completion API. 
- NOTE: Currently we do not support the following features: + NOTE: Currently we do not support the following feature: - suffix (the language models we currently support do not support suffix) - - logit_bias (to be supported by vLLM engine) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: @@ -277,9 +276,6 @@ async def create_completion(self, request: CompletionRequest, if request.suffix is not None: return self.create_error_response( "suffix is not currently supported") - if request.logit_bias is not None and len(request.logit_bias) > 0: - return self.create_error_response( - "logit_bias is not currently supported") model_name = request.model request_id = f"cmpl-{random_uuid()}" From 8b430d7dea5695324636fc458c1cce52213bd499 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 20:23:50 -0800 Subject: [PATCH 023/113] [Minor] Fix StableLMEpochForCausalLM -> StableLmForCausalLM (#3046) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 35b548d2737ce..9d4ec663a16e5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -83,7 +83,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. - * - :code:`StableLMEpochForCausalLM` + * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. From 71bcaf99e2cb2c677bf3a9addb9e8039cbcab22a Mon Sep 17 00:00:00 2001 From: Tao He Date: Tue, 27 Feb 2024 17:14:31 +0800 Subject: [PATCH 024/113] Enable GQA support in the prefix prefill kernels (#3007) Signed-off-by: Tao He --- tests/kernels/test_prefix_prefill.py | 61 +++++++++++++------ vllm/model_executor/layers/attention.py | 34 ++++++----- .../layers/triton_kernel/prefix_prefill.py | 39 ++++++++---- 3 files changed, 87 insertions(+), 47 deletions(-) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index ac93b32588cca..c068b38a66910 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -8,7 +8,8 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask -NUM_HEADS = [12] +NUM_HEADS = [64] +NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128] DTYPES = [torch.float16] CUDA_DEVICES = [ @@ -17,12 +18,14 @@ @pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_contexted_kv_attention( num_heads: int, + num_queries_per_kv: int, head_size: int, dtype: torch.dtype, device: str, @@ -41,28 +44,29 @@ def test_contexted_kv_attention( subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv num_tokens = sum(subquery_lens) query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) query.uniform_(-1e-3, 1e-3) output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, 
dtype=dtype) + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) kv.uniform_(-1e-3, 1e-3) key, value = kv.unbind(dim=1) k_cache = torch.zeros(cache_size, block_size, - num_heads, + num_kv_heads, head_size, dtype=dtype) v_cache = torch.zeros(cache_size, block_size, - num_heads, + num_kv_heads, head_size, dtype=dtype) - k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) - v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) + k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) values = torch.arange(0, cache_size, dtype=torch.long) values = values[torch.randperm(cache_size)] block_table = values[:BS * max_block_per_request].view( @@ -93,19 +97,21 @@ def test_contexted_kv_attention( end_loc = start_loc + block_size start_slot = block_table[i, block_id] * block_size end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) cur_ctx += block_size block_id += 1 # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = k_cache.view(-1, block_size, num_heads, head_size // 8, + k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8).permute(0, 2, 3, 1, 4).contiguous() # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = v_cache.view(-1, block_size, num_heads, + v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() # Warm up the Triton kernel by calling it once before actually measuring generation time @@ -123,12 +129,29 @@ def test_contexted_kv_attention( attn_op = xops.fmha.cutlass.FwOp() + if num_kv_heads != num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. 
+ # + # see also: vllm/model_executor/layers/attention.py + query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv, + query.shape[-1]) + key = key[:, :, None, :].expand(key.shape[0], num_kv_heads, + num_queries_per_kv, key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], num_kv_heads, + num_queries_per_kv, value.shape[-1]) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( subquery_lens, seq_lens) output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), + query, + key, + value, attn_bias=attn_bias, p=0.0, scale=scale, @@ -137,9 +160,9 @@ def test_contexted_kv_attention( torch.cuda.synchronize() start_time = time.time() output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), + query, + key, + value, attn_bias=attn_bias, p=0.0, scale=scale, @@ -148,5 +171,5 @@ def test_contexted_kv_attention( torch.cuda.synchronize() end_time = time.time() print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") - output_ref = output_ref.squeeze(0) + output_ref = output_ref.squeeze(0, 2) assert torch.allclose(output_ref, output, atol=1e-6, rtol=0) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 0622a54db1bc0..2a82325b80213 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -137,25 +137,27 @@ def forward( ) if input_metadata.is_prompt: - # Prompt run. - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) # normal attention if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. 
diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index a1a2ab0c4805c..70f09224f1cf6 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -45,6 +45,7 @@ def _fwd_kernel( stride_v_cache_h, stride_v_cache_d, stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -53,6 +54,8 @@ def _fwd_kernel( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) @@ -85,13 +88,14 @@ def _fwd_kernel( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -131,9 +135,9 @@ def _fwd_kernel( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -232,6 +236,7 @@ def _fwd_kernel_flash_attn_v2( stride_v_cache_h, stride_v_cache_d, stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -240,6 +245,8 @@ def _fwd_kernel_flash_attn_v2( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) @@ -272,13 +279,14 @@ def _fwd_kernel_flash_attn_v2( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -317,9 +325,9 @@ def _fwd_kernel_flash_attn_v2( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -420,6 +428,7 @@ def _fwd_kernel_alibi( stride_v_cache_h, stride_v_cache_d, 
stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -429,6 +438,8 @@ def _fwd_kernel_alibi( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + # cur_batch_seq_len: the length of prompts # cur_batch_ctx_len: the length of prefix # cur_batch_in_all_start_index: the start id of the dim=0 @@ -468,13 +479,14 @@ def _fwd_kernel_alibi( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -522,9 +534,9 @@ def _fwd_kernel_alibi( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -628,6 +640,7 @@ def context_attention_fwd(q, sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] + num_queries_per_kv = q.shape[1] // k.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, @@ -674,6 +687,7 @@ def context_attention_fwd(q, v_cache.stride(2), v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, @@ -721,6 +735,7 @@ def context_attention_fwd(q, v_cache.stride(2), v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, From a8683102cc0ab9c1a0c3ae1ba2b7954f78eba1b3 Mon Sep 17 00:00:00 2001 From: Ganesh Jagadeesan Date: Wed, 28 Feb 2024 00:26:15 -0500 Subject: [PATCH 025/113] multi-lora documentation fix (#3064) --- docs/source/models/lora.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 1910f26506611..21b18c75fc552 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -58,7 +58,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. .. code-block:: bash - python -m vllm.entrypoints.api_server \ + python -m vllm.entrypoints.openai.api_server \ --model meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/ @@ -89,3 +89,15 @@ with its base model: Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). + +The following is an example request + +.. 
code-block::bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq From e46fa5d52e02ee48d5fdd12b35e39993008b4bd6 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Wed, 28 Feb 2024 13:38:26 +0800 Subject: [PATCH 026/113] Restrict prometheus_client >= 0.18.0 to prevent errors when importing pkgs (#3070) --- requirements-neuron.txt | 2 +- requirements-rocm.txt | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 36e629add664d..858472c20ca8c 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -6,4 +6,4 @@ neuronx-cc fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 diff --git a/requirements-rocm.txt b/requirements-rocm.txt index e759ba7d028d9..53bd11de7c9de 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -10,4 +10,4 @@ transformers >= 4.38.0 # Required for Gemma. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 diff --git a/requirements.txt b/requirements.txt index de93ba6354cda..d4599ec95d945 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. From 3b7178cfa4a317922d4aef9dd3b2647b8d950e7d Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Wed, 28 Feb 2024 09:34:34 -0800 Subject: [PATCH 027/113] [Neuron] Support inference with transformers-neuronx (#2569) --- examples/offline_inference_neuron.py | 33 ++++ tests/lora/conftest.py | 8 +- vllm/config.py | 41 ++++- vllm/engine/arg_utils.py | 16 +- vllm/engine/llm_engine.py | 21 ++- vllm/lora/layers.py | 4 + vllm/model_executor/__init__.py | 3 +- vllm/model_executor/layers/sampler.py | 18 +- vllm/model_executor/model_loader.py | 10 +- vllm/model_executor/models/__init__.py | 12 +- vllm/model_executor/models/neuron/llama.py | 79 +++++++++ vllm/model_executor/neuron_model_loader.py | 66 +++++++ vllm/model_executor/sampling_metadata.py | 4 +- vllm/model_executor/utils.py | 17 ++ vllm/utils.py | 8 + vllm/worker/cache_engine.py | 11 +- vllm/worker/model_runner.py | 16 +- vllm/worker/neuron_worker.py | 191 +++++++++++++++++++++ 18 files changed, 516 insertions(+), 42 deletions(-) create mode 100644 examples/offline_inference_neuron.py create mode 100644 vllm/model_executor/models/neuron/llama.py create mode 100644 vllm/model_executor/neuron_model_loader.py create mode 100644 vllm/worker/neuron_worker.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py new file mode 100644 index 0000000000000..9b9dc4d94892f --- /dev/null +++ b/examples/offline_inference_neuron.py @@ -0,0 +1,33 @@ +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. 
+llm = LLM( + model="openlm-research/open_llama_3b", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as max sequence length, + # when targeting neuron device. Currently, this is a known limitation in continuous batching + # support in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=128, + block_size=128, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, or explicitly assigned. + device="neuron") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0ca0715334c25..75f4e41290c36 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -131,9 +131,11 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() get_model_old = get_model - def get_model_patched(model_config, device_config, lora_config=None): - return get_model_old(model_config, device_config, - LoRAConfig(max_loras=4, max_lora_rank=8)) + def get_model_patched(model_config, device_config, **kwargs): + return get_model_old(model_config, + device_config, + lora_config=LoRAConfig(max_loras=4, + max_lora_rank=8)) with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) diff --git a/vllm/config.py b/vllm/config.py index bd0dc89b585f7..fc848b72d7f2a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version +from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version logger = init_logger(__name__) @@ -380,13 +380,21 @@ def __init__( disable_custom_all_reduce: bool = False, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size - self.tensor_parallel_size = tensor_parallel_size + if is_neuron(): + # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. + # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # to multiple NeuronCores. + self.tensor_parallel_size = 1 + self.neuron_tp_degree = tensor_parallel_size + else: + self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce - self.world_size = pipeline_parallel_size * tensor_parallel_size - if self.world_size > 1: + self.world_size = pipeline_parallel_size * self.tensor_parallel_size + # Ray worker is not supported for Neuron backend. 
+ if self.world_size > 1 and not is_neuron(): self.worker_use_ray = True self._verify_args() @@ -465,8 +473,29 @@ def _verify_args(self) -> None: class DeviceConfig: - def __init__(self, device: str = "cuda") -> None: - self.device = torch.device(device) + def __init__(self, device: str = "auto") -> None: + if device == "auto": + # Automated device type detection + if torch.cuda.is_available(): + self.device_type = "cuda" + elif is_neuron(): + self.device_type = "neuron" + else: + raise RuntimeError("No supported device detected.") + else: + # Device type is assigned explicitly + self.device_type = device + + # Some device types require processing inputs on CPU + if self.device_type in ["neuron"]: + self.device = torch.device("cpu") + else: + # Set device with device type + self.device = torch.device(self.device_type) + + @property + def is_neuron(self): + return self.device_type == "neuron" @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4efd171b871d..c01e7311fb89a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,7 +44,7 @@ class EngineArgs: lora_extra_vocab_size: int = 256 lora_dtype = 'auto' max_cpu_loras: Optional[int] = None - device: str = 'cuda' + device: str = 'auto' def __post_init__(self): if self.tokenizer is None: @@ -171,7 +171,7 @@ def add_cli_args( parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 128], help='token block size') parser.add_argument('--seed', type=int, @@ -264,13 +264,11 @@ def add_cli_args( help=('Maximum number of LoRAs to store in CPU memory. ' 'Must be >= than max_num_seqs. ' 'Defaults to max_num_seqs.')) - parser.add_argument( - "--device", - type=str, - default=EngineArgs.device, - choices=["cuda"], - help=('Device type for vLLM execution. ' - 'Currently, only CUDA-compatible devices are supported.')) + parser.add_argument("--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron"], + help='Device type for vLLM execution.') return parser @classmethod diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f5b2145c22d6f..f0fd7efdef813 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -3,6 +3,7 @@ import os import time import pickle +import importlib from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) @@ -20,7 +21,8 @@ SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) -from vllm.utils import Counter, set_cuda_visible_devices, get_ip, get_open_port, get_distributed_init_method +from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, + get_open_port, get_distributed_init_method) if ray: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -31,6 +33,12 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
@@ -138,10 +146,17 @@ def __init__( def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + def _init_workers(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker + Worker = self._dispatch_worker() assert self.parallel_config.world_size == 1, ( "Ray is required if parallel_config.world_size > 1.") @@ -243,7 +258,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker + Worker = self._dispatch_worker() # Initialize torch distributed process group for the workers. model_config = copy.deepcopy(self.model_config) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e1aac20b038b4..e667d70f71e39 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -795,6 +795,10 @@ def __init__( self.dtype = dtype self.device = device + @property + def logits_as_hidden_states(self): + return self.base_layer.logits_as_hidden_states + @property def vocab_size(self): return self.base_layer.vocab_size diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 0d5b2004ad7cb..cd6dbde5f54cf 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,7 +1,6 @@ from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.model_loader import get_model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.utils import set_random_seed, get_model __all__ = [ "InputMetadata", diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 884d84387e505..71655b216fb3d 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -10,6 +10,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.utils import is_neuron class Sampler(nn.Module): @@ -32,6 +33,8 @@ def __init__(self, org_vocab_size: Optional[int] = None) -> None: super().__init__() self.vocab_size = vocab_size + # Transformers-neuronx generate outputs as logits directly. + self.logits_as_hidden_states = is_neuron() # original vocabulary size (without LoRA). self.org_vocab_size = org_vocab_size or vocab_size @@ -55,10 +58,14 @@ def forward( embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[SamplerOutput]: # Get the hidden states that we use for sampling. - hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) + if self.logits_as_hidden_states: + logits = hidden_states + else: + hidden_states = _prune_hidden_states(hidden_states, + sampling_metadata) - # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, embedding, embedding_bias) + # Get the logits for the next tokens. + logits = self._get_logits(hidden_states, embedding, embedding_bias) # Only perform sampling in the driver worker. 
# Note: `_get_logits` is still distributed across TP workers because @@ -395,7 +402,8 @@ def _sample( sample_metadata[sampling_type] = (seq_group_ids, seq_groups, is_prompts, sample_indices) if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1) + greedy_samples = torch.argmax(logprobs[sample_indices.long()], + dim=-1) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_best_of = 1 for seq_group, is_prompt in zip(seq_groups, is_prompts): @@ -407,7 +415,7 @@ def _sample( "generators": sampling_metadata.generators, } multinomial_samples[sampling_type] = _multinomial( - probs[sample_indices], max_best_of, **seeded_args) + probs[sample_indices.long()], max_best_of, **seeded_args) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index ebe092b5d62ba..cb64d80c8147d 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -1,11 +1,11 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Optional, Type +from typing import Type import torch import torch.nn as nn -from vllm.config import DeviceConfig, ModelConfig, LoRAConfig +from vllm.config import DeviceConfig, ModelConfig from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) @@ -37,9 +37,9 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]: f"Supported architectures: {ModelRegistry.get_supported_archs()}") -def get_model(model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig] = None) -> nn.Module: +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> nn.Module: + lora_config = kwargs.get("lora_config", None) model_class = _get_model_architecture(model_config) # Get the (maybe quantized) linear method. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 66d28207d664f..e4f3a785cd99a 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -4,7 +4,7 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm.utils import is_hip +from vllm.utils import is_hip, is_neuron logger = init_logger(__name__) @@ -61,6 +61,9 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } +# Models not supported by Neuron. 
+_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} + class ModelRegistry: @@ -77,8 +80,15 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: logger.warning( f"Model architecture {model_arch} is partially supported " "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + elif is_neuron(): + if model_arch not in _NEURON_SUPPORTED_MODELS: + raise ValueError( + f"Model architecture {model_arch} is not supported by " + "Neuron for now.") module_name, model_cls_name = _MODELS[model_arch] + if is_neuron(): + module_name = _NEURON_SUPPORTED_MODELS[model_arch] module = importlib.import_module( f"vllm.model_executor.models.{module_name}") return getattr(module, model_cls_name, None) diff --git a/vllm/model_executor/models/neuron/llama.py b/vllm/model_executor/models/neuron/llama.py new file mode 100644 index 0000000000000..e2856da99d9b1 --- /dev/null +++ b/vllm/model_executor/models/neuron/llama.py @@ -0,0 +1,79 @@ +"""Inference-only LLaMA model compatible with HuggingFace weights.""" +import os +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class LlamaForCausalLM(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + with torch.inference_mode(): + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids.flatten()) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.llama.model import LlamaForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers.models.llama import LlamaForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = LlamaForCausalLM.from_pretrained(model_name_or_path, + low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = LlamaForSampling.from_pretrained(split_model_dir, + **kwargs) + self.model.to_neuron() diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py new file mode 100644 index 0000000000000..b8d63d4ff12fc --- /dev/null +++ b/vllm/model_executor/neuron_model_loader.py @@ -0,0 +1,66 @@ +"""Utilities for selecting and loading models.""" 
+from typing import Type + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import ModelConfig, DeviceConfig +from vllm.model_executor.models import ModelRegistry + +TORCH_DTYPE_TO_NEURON_AMP = { + "auto": "f32", + "half": "f16", + "float16": "f16", + "bfloat16": "bf16", + "float": "f32", + "float32": "f32", + torch.float16: "f16", + torch.bfloat16: "bf16", + torch.float32: "f32", +} + + +def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: + architectures = getattr(config, "architectures", []) + for arch in architectures: + model_cls = ModelRegistry.load_model_cls(arch) + if model_cls is not None: + return model_cls + raise ValueError( + f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {ModelRegistry.get_supported_archs()}") + + +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> nn.Module: + from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + + parallel_config = kwargs.get("parallel_config") + scheduler_config = kwargs.get("scheduler_config") + + model_class = _get_model_architecture(model_config.hf_config) + linear_method = None + + # Create a model instance. + model = model_class(model_config.hf_config, linear_method) + + continuous_batching_config = ContinuousBatchingConfig( + batch_size_for_shared_caches=scheduler_config.max_num_seqs) + neuron_config = NeuronConfig( + continuous_batching=continuous_batching_config) + + # Load the weights from the cached or downloaded files. + model.load_weights( + model_config.model, + model_config.download_dir, + model_config.load_format, + model_config.revision, + tp_degree=parallel_config.neuron_tp_degree, + amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], + neuron_config=neuron_config, + context_length_estimate=[scheduler_config.max_model_len], + n_positions=[scheduler_config.max_model_len], + batch_size=scheduler_config.max_num_seqs) + + return model.eval() diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index d0ffeecd2d74d..7deb80801856e 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_neuron _SAMPLING_EPS = 1e-5 @@ -155,7 +155,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
- pin_memory = not in_wsl() + pin_memory = not in_wsl() and not is_neuron() prompt_max_len = max(len(tokens) for tokens in prompt_tokens) prompt_padded_tokens = [ tokens + [vocab_size] * (prompt_max_len - len(tokens)) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 336bc1cd005cf..0113e3edf0675 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,10 +1,18 @@ """Utils for model executor.""" import random +import importlib from typing import Any, Dict, Optional import numpy as np import torch +from vllm.config import DeviceConfig, ModelConfig + +DEVICE_TO_MODEL_LOADER_MAP = { + "cuda": "model_loader", + "neuron": "neuron_model_loader", +} + def set_random_seed(seed: int) -> None: random.seed(seed) @@ -33,3 +41,12 @@ def set_weight_attrs( assert not hasattr( weight, key), (f"Overwriting existing tensor attribute: {key}") setattr(weight, key, value) + + +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> torch.nn.Module: + model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type] + imported_model_loader = importlib.import_module( + f"vllm.model_executor.{model_loader_module}") + get_model_fn = imported_model_loader.get_model + return get_model_fn(model_config, device_config, **kwargs) diff --git a/vllm/utils.py b/vllm/utils.py index c8ac57de6f5f5..a4f9bfe6aac99 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -118,6 +118,14 @@ def is_hip() -> bool: return torch.version.hip is not None +def is_neuron() -> bool: + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + + def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # NOTE: This import statement should be executed lazily since diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index bbe33989fc2a4..880299783935c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -3,10 +3,9 @@ import torch -from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) @@ -39,6 +38,10 @@ def __init__( self.num_gpu_blocks = cache_config.num_gpu_blocks self.num_cpu_blocks = cache_config.num_cpu_blocks + # Skip initializing CUDA stream and buffer for Neuron backend. + if is_neuron(): + return + if cache_config.cache_dtype == "auto": self.dtype = model_config.dtype else: @@ -121,6 +124,8 @@ def _swap( dst: List[KVCache], src_to_dst: Dict[int, int], ) -> None: + from vllm._C import cache_ops + with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): src_key_cache, src_value_cache = src[i] @@ -140,6 +145,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + from vllm._C import cache_ops + key_caches = [key_cache for key_cache, _ in self.gpu_cache] value_caches = [value_cache for _, value_cache in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 
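The utils.py and vllm/utils.py hunks above add two small mechanisms: backend detection by attempting an optional import (is_neuron), and a device-type keyed map that is resolved lazily with importlib, so CUDA-only modules such as the vllm._C extension are only imported on backends that need them (the cache_ops import above is deferred for the same reason). The snippet below is a minimal, standalone sketch of that dispatch pattern for illustration only; the my_pkg.* module paths and the __main__ check are hypothetical and do not appear in the patch.

"""Sketch of lazy, device-keyed loader dispatch (illustrative names only)."""
import importlib
from typing import Callable


def has_module(name: str) -> bool:
    # Optional-import feature detection, mirroring is_neuron() above.
    try:
        importlib.import_module(name)
        return True
    except ImportError:
        return False


# Device type -> module expected to expose a get_model() function.
# The module paths here are hypothetical placeholders.
LOADER_MAP = {
    "cuda": "my_pkg.model_loader",
    "neuron": "my_pkg.neuron_model_loader",
}


def resolve_get_model(device_type: str) -> Callable:
    # Import the backend module only when that backend is actually selected,
    # so e.g. CUDA extensions are never pulled in on a Neuron host.
    module = importlib.import_module(LOADER_MAP[device_type])
    return module.get_model


if __name__ == "__main__":
    print("transformers_neuronx available:", has_module("transformers_neuronx"))
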
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b99a409e02d1e..efe570778fb43 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -80,9 +80,16 @@ def __init__( self.in_wsl = in_wsl() self.kv_cache_dtype = kv_cache_dtype + # Set enforce_eager to True for Neuron backend, to avoid capturing graph + if self.device_config.is_neuron: + self.model_config.enforce_eager = True + def load_model(self) -> None: - self.model = get_model(self.model_config, self.device_config, - self.lora_config) + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) vocab_size = self.model.config.vocab_size @@ -393,6 +400,7 @@ def _prepare_sample( selected_token_start_idx = 0 categorized_sample_indices = {t: [] for t in SamplingType} categorized_sample_indices_start_idx = 0 + pin_memory = not self.in_wsl and not self.device_config.is_neuron max_subquery_len = max(subquery_lens) if subquery_lens else 1 for i, seq_group_metadata in enumerate(seq_group_metadata_list): @@ -443,12 +451,12 @@ def _prepare_sample( selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, target_device=self.device, - pin_memory=not self.in_wsl) + pin_memory=pin_memory) categorized_sample_indices = { t: _async_h2d(seq_ids, dtype=torch.int, target_device=self.device, - pin_memory=not self.in_wsl) + pin_memory=pin_memory) for t, seq_ids in categorized_sample_indices.items() } diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py new file mode 100644 index 0000000000000..3229a21c11a38 --- /dev/null +++ b/vllm/worker/neuron_worker.py @@ -0,0 +1,191 @@ +"""A Neuron worker class.""" +from typing import Dict, List, Optional, Tuple + +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.model_runner import ModelRunner + + +class Worker: + """A worker class that executes the model on a group of neuron cores. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.model_runner = ModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # self.init_cache_engine(). 
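+        # On Neuron these stay mostly unused: CacheEngine skips CUDA
+        # stream/buffer allocation and profile_num_available_blocks() simply
+        # reports max_num_seqs "GPU" blocks with no CPU swap space.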
+ self.cache_config = None + self.cache_engine = None + self.cache_events = None + self.gpu_cache = None + + def init_model(self) -> None: + # Initialize the distributed environment. + _init_distributed_environment(self.parallel_config, + self.rank, + self.distributed_init_method, + distributed_backend="gloo") + + # Initialize the model. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int = 128, + gpu_memory_utilization: float = 0.9, + cpu_swap_space: int = 0, + cache_dtype: str = "float16", + ) -> Tuple[int, int]: + """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + num_gpu_blocks = self.scheduler_config.max_num_seqs + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig) -> None: + self.cache_config = cache_config + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config) + self.model_runner.set_block_size(self.cache_engine.block_size) + + def warm_up_model(self) -> None: + # Warm up is maintained in transformers-neuronx + pass + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + issued_cache_op = False + if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + issued_cache_op = True + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + issued_cache_op = True + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + issued_cache_op = True + + cache_events = self.cache_events if issued_cache_op else None + + # Wait for cache operations to finish. + if cache_events is not None: + raise NotImplementedError( + "cache operations are not implemented for neuron backend.") + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. 
+ if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.gpu_cache) + return output + + +def _init_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + distributed_backend: Optional[str] = None, +) -> None: + """Initialize the distributed environment.""" + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + distributed_backend = distributed_backend if distributed_backend else "nccl" + torch.distributed.init_process_group( + backend=distributed_backend, + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1)) + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) From 929b4f2973ec6a53ea4f0f03d21147ef8b8278be Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 28 Feb 2024 13:03:28 -0800 Subject: [PATCH 028/113] Add LoRA support for Gemma (#3050) --- .buildkite/test-pipeline.yaml | 2 +- csrc/punica/bgmv/bgmv_config.h | 2 ++ tests/lora/conftest.py | 5 ++++ tests/lora/test_gemma.py | 46 +++++++++++++++++++++++++++++ tests/lora/test_punica.py | 4 +-- vllm/model_executor/models/gemma.py | 28 ++++++++++++++++-- vllm/model_executor/models/llama.py | 2 +- 7 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 tests/lora/test_gemma.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index efcc4d2d07a12..c65ab04b8ddda 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,7 +50,7 @@ steps: command: pytest -v -s worker - label: LoRA Test - command: pytest -v -s lora + command: pytest -v -s lora --forked - label: Metrics Test command: pytest -v -s metrics diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index ebf638f104c3f..d5fee9c40d00c 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ f(in_T, out_T, W_T, narrow, 8192) \ @@ -39,6 +40,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 75f4e41290c36..67273144ecd02 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -126,6 +126,11 @@ def mixtral_lora_files(): return snapshot_download(repo_id="terrysun/mixtral-lora-adapter") +@pytest.fixture(scope="session") +def gemma_lora_files(): + return 
snapshot_download(repo_id="wskwon/gemma-7b-test-lora") + + @pytest.fixture def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py new file mode 100644 index 0000000000000..0082c6e74e888 --- /dev/null +++ b/tests/lora/test_gemma.py @@ -0,0 +1,46 @@ +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "google/gemma-7b" + + +def do_sample(llm, lora_path: str, lora_id: int) -> str: + prompts = [ + "Quote: Imagination is", + "Quote: Be yourself;", + "Quote: So many books,", + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def test_gemma_lora(gemma_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4) + + expected_lora_output = [ + "more important than knowledge.\nAuthor: Albert Einstein\n", + "everyone else is already taken.\nAuthor: Oscar Wilde\n", + "so little time\nAuthor: Frank Zappa\n", + ] + + output1 = do_sample(llm, gemma_lora_files, lora_id=1) + for i in range(len(expected_lora_output)): + assert output1[i].startswith(expected_lora_output[i]) + output2 = do_sample(llm, gemma_lora_files, lora_id=2) + for i in range(len(expected_lora_output)): + assert output2[i].startswith(expected_lora_output[i]) diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 903814faa5dc7..cbe0f6fa2e851 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -44,8 +44,8 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, - 5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000, - 32256, 32512, 32768, 33024 + 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, + 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index d8b515993d8ff..03948132d32c3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from torch import nn from transformers import GemmaConfig +from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import PagedAttention @@ -246,12 +247,36 @@ def forward( class GemmaForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + # Gemma does not apply LoRA to the embedding layer. + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, config: GemmaConfig, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config # Unused. 
super().__init__() self.config = config self.linear_method = linear_method @@ -305,9 +330,6 @@ def load_weights(self, weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra layer for lora models. - if "lm_head" in name: - continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. if "norm.weight" in name: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..d35887cc0f6a3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -27,6 +27,7 @@ from torch import nn from transformers import LlamaConfig +from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention @@ -45,7 +46,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] From 01a5d18a537b65a156cfa1a77706693a24c869c1 Mon Sep 17 00:00:00 2001 From: CHU Tianxiang Date: Thu, 29 Feb 2024 13:52:23 +0800 Subject: [PATCH 029/113] Add Support for 2/3/8-bit GPTQ Quantization Models (#2330) --- csrc/ops.h | 6 +- csrc/quantization/gptq/matrix_view.cuh | 123 ++ csrc/quantization/gptq/q_gemm.cu | 1452 +++++++++++++++-- csrc/quantization/gptq/qdq_2.cuh | 87 + csrc/quantization/gptq/qdq_3.cuh | 141 ++ csrc/quantization/gptq/qdq_4.cuh | 100 +- csrc/quantization/gptq/qdq_8.cuh | 40 + .../layers/quantization/gptq.py | 16 +- 8 files changed, 1736 insertions(+), 229 deletions(-) create mode 100644 csrc/quantization/gptq/qdq_2.cuh create mode 100644 csrc/quantization/gptq/qdq_3.cuh create mode 100644 csrc/quantization/gptq/qdq_8.cuh diff --git a/csrc/ops.h b/csrc/ops.h index dbdd2c2c57945..08dfb0e8604f1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -98,11 +98,13 @@ torch::Tensor gptq_gemm( torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama); + bool use_exllama, + int bit); void gptq_shuffle( torch::Tensor q_weight, - torch::Tensor q_perm); + torch::Tensor q_perm, + int bit); void moe_align_block_size( torch::Tensor topk_ids, diff --git a/csrc/quantization/gptq/matrix_view.cuh b/csrc/quantization/gptq/matrix_view.cuh index 1fdf019b29028..eda3436eb5375 100644 --- a/csrc/quantization/gptq/matrix_view.cuh +++ b/csrc/quantization/gptq/matrix_view.cuh @@ -146,6 +146,129 @@ public: __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } }; +class MatrixView_q2_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int shift = (column & 0x0f) * 2; + return (data[row * width / 16 + column / 16] >> shift) & 0x03; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const + { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row * width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row 
* width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + items[2] = (d >> 4) & 0x03; + items[3] = (d >> 6) & 0x03; + } +}; + +class MatrixView_q3_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int z_w = column * 3 / 32; + int z_mod = column & 0x1f; + + if (z_mod == 10) { + return (data[row * width * 3 / 32 + z_w] >> 30) | ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4); + } else if (z_mod == 21) { + return (data[row * width * 3 / 32 + z_w] >> 31) | ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6); + } else if (z_mod < 10) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07; + } else if (z_mod < 21) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07; + } else { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07; + } + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x1f); + uint32_t d; + if (shift <= 4) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3); + } else if (shift == 8) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8); + } else if (shift <= 16) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32); + } else if (shift == 20) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4); + } else { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64); + } + items[0] = d & 0x07; + items[1] = (d >> 3) & 0x07; + items[2] = (d >> 6) & 0x07; + items[3] = (d >> 9) & 0x07; + } +}; + +class MatrixView_q8_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int shift = (column & 0x03) * 8; + return (data[row * width / 4 + column / 4] >> shift) & 0xff; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const + { + int shift = (column & 0x03) * 8; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x03) * 2; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + items[2] = (d >> 16) & 0xff; + items[3] = (d >> 24) & 0xff; + } +}; + } // namespace gptq } // namespace vllm #endif diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index a5d2345f1e7fd..655158e38f557 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -13,7 +13,10 @@ Adapted from https://github.com/turboderp/exllamav2 and https://github.com/qwopq #include "compat.cuh" #include "matrix_view.cuh" +#include "qdq_2.cuh" +#include "qdq_3.cuh" #include "qdq_4.cuh" +#include "qdq_8.cuh" namespace vllm { namespace gptq { @@ -22,6 +25,7 @@ namespace gptq { #define BLOCK_M_SIZE_MAX 8 #define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32) 
#define MAX_Q_GEMM_ROWS 50 +#define MAX_Q_GEMM_ROWS_8BIT 24 #define MAX_ALT_GEMM_ROWS 8 #define THREADS_X 32 #define THREADS_Y 32 @@ -75,6 +79,106 @@ __forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) return __half2float(__low2half(result)) + __half2float(__high2half(result)); } +__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h) +{ + // Use FP32 accumulator to avoid potential overflow since unscaled weights are in the range -128..127 + + float result = {}; + #pragma unroll + for (int i = 0; i < 4; i++) + { + half2 w01 = dq[i]; + float w0 = __low2float(w01); + float w1 = __high2float(w01); + float x0 = __half2float(*a_ptr++); + float x1 = __half2float(*a_ptr++); + result = fma(w0, x0, result); + result = fma(w1, x1, result); + } + float qs = __half2float(qs_h); + result *= qs; + half result_h = __float2half_rn(result); + return __hadd(result_h, g_result); +} + +__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = 
__hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); +} + +__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = __hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); +} + + typedef void (*fp_gemm_half_q_half_gptq_kernel) ( const half*, @@ -89,8 +193,9 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel) const int* ); + template -__global__ void gemm_half_q_half_gptq_kernel +__global__ void gemm_half_q_half_gptq_4bit_kernel ( const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, @@ -231,80 +336,794 @@ __global__ void gemm_half_q_half_gptq_kernel } } - -fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count) +template +__global__ void gemm_half_q_half_gptq_2bit_kernel +( + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm +) { - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_gptq_kernel; - #endif - return NULL; -} + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + int t = threadIdx.x; -void gemm_half_q_half_cuda_part + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 2); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* 
a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 1; j++) + { + const int4* b_ptr4 = (int4*) b_ptr; + int4 load_int4 = *b_ptr4; + + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); + + #pragma unroll + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + + b_ptr += size_n; + a_ptr += 16; + } + + k += 16; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +template +__global__ void gemm_half_q_half_gptq_3bit_kernel ( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - int groups + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm ) { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count); + int t = threadIdx.x; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - a, - b_q_weight, - b_gptq_qzeros, - b_gptq_scales, - c, - size_m, - size_n, - size_k, - groups, - b_q_perm - ); -} + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + half a0; + if (b_q_perm) a0 = 
a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / 32 * 3; + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 1; j++) + { + int4 load_int4[3]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); + + #pragma unroll + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 32; + } + + k += 32; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +template +__global__ void gemm_half_q_half_gptq_8bit_kernel +( + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm +) +{ + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = 
block_a[m]; + + half a0; + if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 8); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 4; j++) + { + int4 load_int4[2]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); + + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 8; + } + k += 32; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel( + bool first_block, const int m_count, const int bit) +{ + #define SELECT_KERNEL(M_COUNT) \ + if (m_count == M_COUNT) { \ + if (bit == 2) return gemm_half_q_half_gptq_2bit_kernel; \ + if (bit == 3) return gemm_half_q_half_gptq_3bit_kernel; \ + if (bit == 4) return gemm_half_q_half_gptq_4bit_kernel; \ + if (bit == 8) return gemm_half_q_half_gptq_8bit_kernel; \ + } + #if BLOCK_M_SIZE_MAX >= 1 + SELECT_KERNEL(1); + #endif + #if BLOCK_M_SIZE_MAX >= 2 + SELECT_KERNEL(2); + #endif + #if BLOCK_M_SIZE_MAX >= 3 + SELECT_KERNEL(3); + #endif + #if BLOCK_M_SIZE_MAX >= 4 + SELECT_KERNEL(4); + #endif + #if BLOCK_M_SIZE_MAX >= 5 + SELECT_KERNEL(5); + #endif + #if BLOCK_M_SIZE_MAX >= 6 + SELECT_KERNEL(6); + #endif + #if BLOCK_M_SIZE_MAX >= 7 + SELECT_KERNEL(7); + #endif + #if BLOCK_M_SIZE_MAX >= 8 + SELECT_KERNEL(8); + #endif + return NULL; +} + + +void gemm_half_q_half_cuda_part +( + const half* a, + const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, + const int* b_q_perm, + half* c, + int size_m, + int size_n, + int size_k, + int m_count, + int groups, + int bit +) +{ + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + blockDim.z = 1; + 
gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); + gridDim.y = DIVIDE(size_m, m_count); + gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + + fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count, bit); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>> + ( + a, + b_q_weight, + b_gptq_qzeros, + b_gptq_scales, + c, + size_m, + size_n, + size_k, + groups, + b_q_perm + ); +} + + +__global__ void reconstruct_exllama_8bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / (32 / 8); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } + + for (int p = 0; p < 4; p++) + { + int4 load_int4[2]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); + + //half* dqh = (half*)dq; + if (b_q_perm) + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} + +__global__ void reconstruct_exllama_4bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + 
MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / (32 / 4); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + half2 z1z16[4][2]; + half2 y1y16[4][2]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + } + + for (int p = 0; p < 4; p++) + { + half2 dq[4][4]; + const int4* b_ptr4 = (int4*) b_ptr; + int4 load_int4 = *b_ptr4; + + dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); + dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); + dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); + dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); + + b_ptr += size_n; + //half* dqh = (half*)dq; + if (b_q_perm) + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} + +__global__ void reconstruct_exllama_3bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x 
* 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / 32* 3; + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } + + for (int p = 0; p < 1; p++) + { + int4 load_int4[3]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); + + if (b_q_perm) + { + for (int j = 0; j < 16; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 16; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} -__global__ void reconstruct_exllama_kernel +__global__ void reconstruct_exllama_2bit_kernel ( const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, @@ -317,7 +1136,7 @@ __global__ void reconstruct_exllama_kernel ) { MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); int offset_k = BLOCK_KN_SIZE * blockIdx.y; @@ -345,21 +1164,15 @@ __global__ void reconstruct_exllama_kernel int nextgroup = offset_k + groupsize; // b offset - int qk = offset_k / (32 / 4); + int qk = offset_k / (32 / 2); const uint32_t* b_ptr = b_q_weight + qk * size_n + n; // Initial zeros/scale int zeros[4]; half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; b_gptq_qzeros_.item4(zeros, group, n); b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); __syncthreads(); @@ -374,28 +1187,24 @@ __global__ void 
reconstruct_exllama_kernel nextgroup += groupsize; b_gptq_qzeros_.item4(zeros, group, n); b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); } - for (int p = 0; p < 4; p++) + for (int p = 0; p < 2; p++) { - half2 dq[4][4]; const int4* b_ptr4 = (int4*) b_ptr; int4 load_int4 = *b_ptr4; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); b_ptr += size_n; //half* dqh = (half*)dq; if (b_q_perm) { - for (int j = 0; j < 4; j++) + for (int j = 0; j < 8; j++) { for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); @@ -404,7 +1213,7 @@ __global__ void reconstruct_exllama_kernel } else { - for (int j = 0; j < 4; j++) + for (int j = 0; j < 8; j++) { for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); @@ -416,7 +1225,6 @@ __global__ void reconstruct_exllama_kernel } } - void reconstruct_exllama ( const uint32_t* b_q_weight, @@ -426,7 +1234,8 @@ void reconstruct_exllama half* out, int height, int width, - int groups + int groups, + int bit ) { dim3 blockDim, gridDim; @@ -435,6 +1244,15 @@ void reconstruct_exllama gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + auto reconstruct_exllama_kernel = reconstruct_exllama_4bit_kernel; + if (bit == 2) { + reconstruct_exllama_kernel = reconstruct_exllama_2bit_kernel; + } else if (bit == 3) { + reconstruct_exllama_kernel = reconstruct_exllama_3bit_kernel; + } else if (bit == 8) { + reconstruct_exllama_kernel = reconstruct_exllama_8bit_kernel; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); reconstruct_exllama_kernel<<>> ( @@ -450,7 +1268,7 @@ void reconstruct_exllama } -__global__ void gemm_half_q_half_alt_kernel( +__global__ void gemm_half_q_half_alt_4bit_kernel( const half2* __restrict__ vec, const uint32_t* __restrict__ mat, half* __restrict__ mul, @@ -548,6 +1366,95 @@ __global__ void gemm_half_q_half_alt_kernel( } +__global__ void gemm_half_q_half_alt_8bit_kernel( + const half2* __restrict__ vec, + const uint32_t* __restrict__ mat, + half* __restrict__ mul, + const half* __restrict__ scales, + const uint32_t* __restrict__ zeros, + const int* __restrict__ g_idx, + int batch, + int height, + int width +) +{ + int zero_width = width / 4; + int vec_height = height * 2; + const int blockwidth2 = BLOCK_KN_SIZE / 2; + int b = blockIdx.y * BLOCK_M_SIZE_MAX; + int b_end = min(BLOCK_M_SIZE_MAX, batch - b); + int h = BLOCK_KN_SIZE * blockIdx.z / 4; + int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; + int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + + __shared__ half2 
blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; + if (threadIdx.x < h_end) { + for (int m = 0; m < b_end; ++m) { + blockvec[m][threadIdx.x] = + vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + + threadIdx.x]; + } + } + + + if (blockIdx.z == 0) + { + for (int m = 0; m < b_end; m++) + mul[(b + m) * width + w] = __int2half_rn(0); + } + __syncthreads(); + + int i = width * h + w; + int g_h = h * 4; + int k = 0; + int z_w = w / 4; + int z_mod = (w % 4) * 8; + half2 res2; + half res[BLOCK_M_SIZE_MAX] = {}; + + unsigned int tmp; + while (k < h_end) { + tmp = mat[i]; + half2 scales_tmp[2]; + half2 zeros_tmp[2]; + for (int tmp_k = 0; tmp_k < 2; tmp_k++) { + int g = g_idx[g_h + (k + tmp_k) * 2]; + int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; + half scale_f = scales[g * width + w]; + half scale_f2 = scales[g2 * width + w]; + half2 scale = __halves2half2(scale_f, scale_f2); + half2 zero = __halves2half2( + __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)), + __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)) + ); + scales_tmp[tmp_k] = scale; + zeros_tmp[tmp_k] = zero; + } + for (int m = 0; m < b_end; m++) { +#ifndef USE_ROCM + res2 = {}; +#else + res2.x = __half_as_ushort(__float2half(0)); + res2.y = __half_as_ushort(__float2half(0)); +#endif + half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF), __int2half_rn((tmp >> 8) & 0xFF)); + res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2); + half2 v34 = __halves2half2(__int2half_rn((tmp >> 16) & 0xFF), __int2half_rn((tmp >> 24) & 0xFF)); + res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2); +#ifndef USE_ROCM + res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); +#else + res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); +#endif + } + i += width; + k += 2; + } + for (int m = 0; m < b_end; m++) { + atomicAdd(&mul[(b + m) * width + w], res[m]); + } +} + void gemm_half_q_half_alt ( const half* a, @@ -558,7 +1465,8 @@ void gemm_half_q_half_alt half* c, int size_m, int size_n, - int size_k + int size_k, + int bit ) { dim3 blockDim, gridDim; @@ -569,8 +1477,13 @@ void gemm_half_q_half_alt gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX); gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + auto kernel = gemm_half_q_half_alt_4bit_kernel; + if (bit == 8) { + kernel = gemm_half_q_half_alt_8bit_kernel; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - gemm_half_q_half_alt_kernel<<>> + kernel<<>> ( (const half2*) a, b_q_weight, @@ -579,12 +1492,12 @@ void gemm_half_q_half_alt b_gptq_qzeros, b_g_idx, size_m, - size_k / 8, + size_k / 32 * bit, size_n ); } - +template __global__ void reconstruct_gptq_kernel ( const uint32_t* __restrict__ w, @@ -600,30 +1513,79 @@ __global__ void reconstruct_gptq_kernel // Start of block int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 8; + int row = blockIdx.y * 32 / bit; if (column >= width) return; // Views - MatrixView_q4_column w_(w, height, width); MatrixView_half_rw out_(out, height, width); MatrixView_half w_scales_(w_scales, group, width); - MatrixView_q4_row w_zeros_(w_zeros, group, width); + T w_zeros_(w_zeros, group, width); - uint32_t w_read = w_.item_uint32_t(row, column); + uint32_t w_read = w[blockIdx.y * width + column]; half* out_ptr = out_.item_ptr(row, column); #pragma unroll - for (int s = 0; s < 32; s += 4) + for (int s = 0; s < 32; s += bit) { - int group = g_idx[row + s / 4]; + int group = g_idx[row + s / 
bit]; half w_scale = w_scales_.item(group, column); uint32_t w_zero = w_zeros_.item(group, column) + 1; - half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); + half w_item = __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero), w_scale); *out_ptr = w_item; out_ptr += out_.width; } } +__global__ void reconstruct_gptq_3bit_kernel +( + const uint32_t* __restrict__ w, + const half* __restrict__ w_scales, + const uint32_t* __restrict__ w_zeros, + const int* __restrict__ g_idx, + const int height, + const int width, + const int group, + half* __restrict__ out +) +{ + // Start of block + int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + int row = blockIdx.y * 32; + if (column >= width) return; + + // Views + + MatrixView_half_rw out_(out, height, width); + MatrixView_half w_scales_(w_scales, group, width); + MatrixView_q3_row w_zeros_(w_zeros, group, width); + + uint32_t w1 = w[(blockIdx.y * 3) * width + column]; + uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column]; + uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column]; + half* out_ptr = out_.item_ptr(row, column); + + #pragma unroll + for (int i = 0; i < 32; i += 1) + { + int group = g_idx[row + i]; + half w_scale = w_scales_.item(group, column); + uint32_t w_zero = w_zeros_.item(group, column) + 1; + int w_item; + if (i == 10) { + w_item = (w1 >> 30) | ((w2 << 2) & 0x4); + } else if (i == 21) { + w_item = (w2 >> 31) | ((w3 << 1) & 0x6); + } else if (i < 10) { + w_item = ((w1 >> (i * 3)) & 0x7); + } else if (i < 21) { + w_item = ((w2 >> (i * 3 - 32)) & 0x7); + } else { + w_item = ((w3 >> (i * 3 - 64)) & 0x7); + } + *out_ptr = __hmul(__int2half_rn(w_item - w_zero), w_scale); + out_ptr += out_.width; + } +} void reconstruct_gptq ( @@ -634,16 +1596,28 @@ void reconstruct_gptq half* out, int height, int width, - int groups + int groups, + int bit ) { dim3 blockDim, gridDim; blockDim.x = BLOCK_KN_SIZE; blockDim.y = 1; - gridDim.y = DIVIDE(height, 8); + gridDim.y = DIVIDE(height, 32 / bit); gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + + auto kernel = reconstruct_gptq_kernel; + if (bit == 2) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 8) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 3) { + kernel = reconstruct_gptq_3bit_kernel; + gridDim.y = DIVIDE(height, 32); + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - reconstruct_gptq_kernel<<>> + kernel<<>> ( b_q_weight, b_gptq_scales, @@ -671,19 +1645,27 @@ void gemm_half_q_half_cuda int size_n, int size_k, int groups, - bool use_exllama + bool use_exllama, + int bit ) { - if ((use_exllama && size_m > MAX_Q_GEMM_ROWS) || (!use_exllama && size_m > MAX_ALT_GEMM_ROWS)) { + bool use_reconstruct; + if (use_exllama) { + use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) || (bit != 8 && size_m > MAX_Q_GEMM_ROWS)); + } else { + // The 2/3-bit kernels are somehow slower than dequant + gemm baseline, so we disabled them for now. 
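For reference, the templated `reconstruct_gptq_kernel` above spells out the GPTQ storage format: for 2/4/8-bit weights each 32-bit word of `qweight` packs `32 / bit` values along the k dimension, and every element is dequantized as `scale * (q - (zero + 1))` using its group's scale and zero point. A minimal NumPy sketch of that per-column unpack-and-dequantize step, using illustrative names rather than vLLM APIs:

```python
import numpy as np

def dequant_gptq_column(w_packed, scales, zeros, g_idx, bit):
    """Reference version of reconstruct_gptq_kernel for a single output column.

    w_packed: the uint32 words holding this column (k * bit / 32 of them)
    scales, zeros: per-group quantization parameters for this column
    g_idx: maps each of the k rows to its quantization group
    """
    mask = (1 << bit) - 1
    vals_per_word = 32 // bit          # 16, 8 or 4 for 2-, 4- or 8-bit weights
    out = np.empty(len(w_packed) * vals_per_word, dtype=np.float32)
    for word_idx, word in enumerate(w_packed):
        for s in range(vals_per_word):
            k = word_idx * vals_per_word + s
            group = g_idx[k]
            q = (int(word) >> (s * bit)) & mask
            # GPTQ stores zero points off by one, hence the "+ 1".
            out[k] = float(scales[group]) * (q - (int(zeros[group]) + 1))
    return out
```

The 3-bit path cannot reuse this layout because 32 is not a multiple of 3: 32 values occupy 96 bits, i.e. three uint32 words, and values 10 and 21 straddle word boundaries (bits 30..32 and 63..65). That is why `reconstruct_gptq_3bit_kernel` and `make_sequential_3bit_kernel` special-case `i == 10` and `i == 21` instead of using the generic shift-and-mask loop.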
+ use_reconstruct = (bit < 4 || size_m > MAX_ALT_GEMM_ROWS); + } + if (use_reconstruct) { // Reconstruct FP16 matrix, then cuBLAS if (use_exllama) { reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, - size_k, size_n, groups); + size_k, size_n, groups, bit); } else { reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - temp_dq, size_k, size_n, groups); + temp_dq, size_k, size_n, groups, bit); } const half alpha = __float2half(1.0f); @@ -707,7 +1689,7 @@ void gemm_half_q_half_cuda { gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, - groups); + groups, bit); } if (last_chunk_size) @@ -715,18 +1697,17 @@ void gemm_half_q_half_cuda gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, - groups); + groups, bit); } } else { gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, size_m, size_n, size_k); + c, size_m, size_n, size_k, bit); } } - -__global__ void shuffle_kernel +__global__ void shuffle_4bit_kernel ( uint32_t* __restrict__ b_q_weight, const int size_k, @@ -740,13 +1721,53 @@ __global__ void shuffle_kernel while (k < size_k) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } } +__global__ void shuffle_8bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k += 4; } +} + +__global__ void shuffle_2bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; } +} + +__global__ void shuffle_3bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; } +} -__global__ void make_sequential_kernel +__global__ void make_sequential_4bit_kernel ( const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, - const int w_height, const int w_width ) { @@ -778,37 +1799,204 @@ __global__ void make_sequential_kernel w_new2[w_new2_row * w2_stride + w2_column] = dst; } +__global__ void make_sequential_2bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + const uint64_t* w2 = (uint64_t*) w; + uint64_t* w_new2 = (uint64_t*) w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 4; + uint64_t dst = 0; + + #pragma unroll + for (int i = 0; i < 16; i++) + { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 4; + int w2_subrow = source_row & 0x0f; + int w2_row_shift = w2_subrow << 1; + int wnew2_row_shift = i << 1; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x0000000300000003; + src 
<<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; +} + +__global__ void make_sequential_3bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + int w_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w_column >= w_width) return; + int w_new_row = blockIdx.y * 3; + int q_perm_idx = blockIdx.y << 5; + uint32_t dst[3] = {0, 0, 0}; + + #pragma unroll + for (int i = 0; i < 32; i++) + { + int source_row = q_perm[q_perm_idx++]; + int z_w = (source_row / 32) * 3; + int z_mod = source_row % 32; + int z_bit; + + if (z_mod != 10){ + if (z_mod != 21){ + z_bit = z_mod; + if (z_bit > 21){ + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10){ + z_bit *= 3; + z_bit -= 32; + z_w += 1; + } else { + z_bit *= 3; + } + } else { + z_w += 1; + } + } + + uint64_t src; + if (z_mod == 10) { + src = (w[z_w * w_width + w_column] >> 30) | ((w[(z_w + 1) * w_width + w_column] << 2) & 0x4); + } else if (z_mod == 21){ + src = (w[z_w * w_width + w_column] >> 31) | ((w[(z_w + 1) * w_width + w_column] << 1) & 0x6); + } else { + src = w[z_w * w_width + w_column]; + src >>= z_bit; + src &= 0x07; + } + + z_w = 0; + if (i != 10){ + if (i != 21){ + z_bit = i; + if (z_bit > 21){ + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10){ + z_bit *= 3; + z_bit -= 32; + z_w += 1; + } else { + z_bit *= 3; + } + } else { + z_w += 1; + } + } + if (i == 10) { + dst[z_w] |= (src & 0x03) << 30; + dst[z_w + 1] |= ((src & 0x4) >> 2); + } else if (i == 21) { + dst[z_w] |= (src & 0x01) << 31; + dst[z_w + 1] |= ((src & 0x6) >> 1); + } else { + dst[z_w] |= (src << z_bit); + } + } + w_new[w_new_row * w_width + w_column] = dst[0]; + w_new[(w_new_row + 1) * w_width + w_column] = dst[1]; + w_new[(w_new_row + 2) * w_width + w_column] = dst[2]; +} + +__global__ void make_sequential_8bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + const uint64_t* w2 = (uint64_t*) w; + uint64_t* w_new2 = (uint64_t*) w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 2; + uint64_t dst = 0; + + #pragma unroll + for (int i = 0; i < 4; i++) + { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 2; + int w2_subrow = source_row & 0x03; + int w2_row_shift = w2_subrow << 3; + int wnew2_row_shift = i << 3; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x000000ff000000ff; + src <<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; +} + void shuffle_exllama_weight ( uint32_t* q_weight, int* q_perm, int height, - int width + int width, + int bit ) { if (q_perm) { uint32_t* new_qweight = NULL; - cudaMalloc(&new_qweight, height / 8 * width * sizeof(uint32_t)); + cudaMalloc(&new_qweight, height / 32 * bit * width * sizeof(uint32_t)); dim3 blockDim, gridDim; blockDim.x = THREADS_X; blockDim.y = 1; gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 8; - + gridDim.y = height / 32 * bit; + + auto kernel = make_sequential_4bit_kernel; + if (bit == 2) { + kernel = make_sequential_2bit_kernel; + } else if (bit == 3) { + kernel = make_sequential_3bit_kernel; + gridDim.y = height / 32; + } else if (bit == 8) { + kernel = make_sequential_8bit_kernel; + } const cudaStream_t stream = 
at::cuda::getCurrentCUDAStream(); - make_sequential_kernel<<>> + kernel<<>> ( q_weight, new_qweight, q_perm, - height / 8, width ); // Replace qweights - cudaMemcpyAsync(q_weight, new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(q_weight, new_qweight, height / 32 * bit * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); // Cleanup cudaDeviceSynchronize(); cudaFree(new_qweight); @@ -818,6 +2006,14 @@ void shuffle_exllama_weight blockDim.y = 1; gridDim.x = DIVIDE(width, THREADS_X); gridDim.y = 1; + auto shuffle_kernel = shuffle_4bit_kernel; + if (bit == 2) { + shuffle_kernel = shuffle_2bit_kernel; + } else if (bit == 3) { + shuffle_kernel = shuffle_3bit_kernel; + } else if (bit == 8) { + shuffle_kernel = shuffle_8bit_kernel; + } const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); shuffle_kernel<<>>(q_weight, height, width); } @@ -832,13 +2028,14 @@ torch::Tensor gptq_gemm torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama + bool use_exllama, + int bit ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); - at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 8, b_q_weight.size(1)}, options); + at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options); vllm::gptq::gemm_half_q_half_cuda ( @@ -854,7 +2051,8 @@ torch::Tensor gptq_gemm c.size(1), // n a.size(1), // k b_gptq_qzeros.size(0), // group number - use_exllama + use_exllama, + bit ); return c; } @@ -862,14 +2060,16 @@ torch::Tensor gptq_gemm void gptq_shuffle ( torch::Tensor q_weight, - torch::Tensor q_perm + torch::Tensor q_perm, + int bit ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); vllm::gptq::shuffle_exllama_weight( (uint32_t*) q_weight.data_ptr(), q_perm.device().is_meta() ? 
NULL : (int*) q_perm.data_ptr(), - q_weight.size(0) * 8, - q_weight.size(1) + q_weight.size(0) * 32 / bit, + q_weight.size(1), + bit ); } diff --git a/csrc/quantization/gptq/qdq_2.cuh b/csrc/quantization/gptq/qdq_2.cuh new file mode 100644 index 0000000000000..295872a91de37 --- /dev/null +++ b/csrc/quantization/gptq/qdq_2.cuh @@ -0,0 +1,87 @@ +/* +Copied from https://github.com/turboderp/exllamav2 +*/ + +#ifndef _qdq_2_cuh +#define _qdq_2_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { + +// Permutation: +// +// ffddbb99 77553311 eeccaa88 66442200 + +__forceinline__ __device__ void shuffle_2bit_16 +( + uint32_t* q, + int stride +) +{ + uint32_t qa = q[0]; + uint32_t qb = 0; + + #pragma unroll + for (int i = 0; i < 8; i++) + { + uint32_t qa0 = qa & 0x03; + uint32_t qa1 = (qa & 0x0c) >> 2; + qa >>= 4; + qb |= (qa1 << (i * 2 + 16)); + qb |= (qa0 << (i * 2)); + } + q[0] = qb; +} + +__forceinline__ __device__ void dequant_2bit_16 +( + const uint32_t q_0, + half2 (&dq)[8], + int stride, + const uint32_t zero +) +{ + const uint32_t c0 = 0x64006400; + const half y4_ = __float2half_rn(1.0f / 4.0f); + const half y16_ = __float2half_rn(1.0f / 16.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y4 = __halves2half2(y4_, y4_); + const half2 y16 = __halves2half2(y16_, y16_); + const half2 y64 = __halves2half2(y64_, y64_); + + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero)); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z4 = __half2half2(z4_); + const half2 z16 = __half2half2(z16_); + const half2 z64 = __half2half2(z64_); + + uint32_t qa = q_0; + half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 + half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 + half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 + qa >>= 8; + half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 + half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 + half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 + half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 + + dq[0] = __hadd2(q0.as_half2, z1); + dq[1] = __hfma2(q1.as_half2, y4, z4); + dq[2] = __hfma2(q2.as_half2, y16, z16); + dq[3] = __hfma2(q3.as_half2, y64, z64); + dq[4] = __hadd2(q4.as_half2, z1); + dq[5] = __hfma2(q5.as_half2, y4, z4); + dq[6] = __hfma2(q6.as_half2, y16, z16); + dq[7] = __hfma2(q7.as_half2, y64, z64); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/csrc/quantization/gptq/qdq_3.cuh b/csrc/quantization/gptq/qdq_3.cuh new file mode 100644 index 0000000000000..3e7ecde752ba3 --- /dev/null +++ b/csrc/quantization/gptq/qdq_3.cuh @@ -0,0 +1,141 @@ +#ifndef _qdq_3_cuh +#define _qdq_3_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { +// Permutation: +// +// v9997775 55333111 u8886664 44222000 (u, v lsb) +// vjjjhhhf ffdddbbb uiiiggge eecccaaa +// vtttrrrp ppnnnlll usssqqqo oommmkkk + +__forceinline__ __device__ void shuffle_3bit_32 +( + uint32_t* q, + int stride +) +{ + uint32_t qa = q[0 * stride]; + uint32_t qb = q[1 * stride]; + uint32_t qc = q[2 * stride]; + + // qa: aa999888 77766655 54443332 22111000 + // qb: lkkkjjji 
iihhhggg fffeeedd dcccbbba + // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll + + uint32_t qd = qc >> 26; + qc <<= 4; + qc |= qb >> 28; + qb <<= 2; + qb |= qa >> 30; + + // qa: ..999888 77766655 54443332 22111000 + // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa + // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk + // qd: vvvuuu + + uint32_t za = 0; + uint32_t zb = 0; + uint32_t zc = 0; + + for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); } + for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); } + for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); } + + // za: 9997775 55333111 8886664 44222000 + // zb: jjjhhhf ffdddbbb iiiggge eecccaaa + // zc: tttrrrp ppnnnlll sssqqqo oommmkkk + // qd: vvvuuu + + za |= ((qd & 0x01) >> 0) << 15; + zb |= ((qd & 0x02) >> 1) << 15; + zc |= ((qd & 0x04) >> 2) << 15; + za |= ((qd & 0x08) >> 3) << 31; + zb |= ((qd & 0x10) >> 4) << 31; + zc |= ((qd & 0x20) >> 5) << 31; + + // za: v9997775 55333111 u8886664 44222000 (u, v lsb) + // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa + // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk + + q[0 * stride] = za; + q[1 * stride] = zb; + q[2 * stride] = zc; +} + +__forceinline__ __device__ void dequant_3bit_32 +( + const uint32_t q_0, + const uint32_t q_1, + const uint32_t q_2, + half2 (&dq)[16], + int stride, + const uint32_t zero +) +{ + const uint32_t c0 = 0x64006400; + const half y8_ = __float2half_rn(1.0f / 8.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y8 = __halves2half2(y8_, y8_); + const half2 y64 = __halves2half2(y64_, y64_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half); + const half2 z8 = __halves2half2(z8_, z8_); + const half2 z64 = __halves2half2(z64_, z64_); + + uint32_t qa = q_0; + uint32_t qb = q_1; + uint32_t qc = q_2; + + half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 + qa >>= 6; + half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 + half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 + half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 + qa >>= 9; + qa &= 0x00010001; + half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 + half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 + qb >>= 6; + half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 + half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 + half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 + qb >>= 8; + qb &= 0x00020002; + half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 + half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 + qc >>= 6; + half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 + half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 + half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 + qc >>= 7; + qc &= 0x00040004; + half2_uint32 q15((qa | qb | qc) | c0); + + dq[ 
0] = __hadd2( q0.as_half2, z1); + dq[ 1] = __hfma2( q1.as_half2, y8, z8); + dq[ 2] = __hadd2( q2.as_half2, z1); + dq[ 3] = __hfma2( q3.as_half2, y8, z8); + dq[ 4] = __hfma2( q4.as_half2, y64, z64); + dq[ 5] = __hadd2( q5.as_half2, z1); + dq[ 6] = __hfma2( q6.as_half2, y8, z8); + dq[ 7] = __hadd2( q7.as_half2, z1); + dq[ 8] = __hfma2( q8.as_half2, y8, z8); + dq[ 9] = __hfma2( q9.as_half2, y64, z64); + dq[10] = __hadd2(q10.as_half2, z1); + dq[11] = __hfma2(q11.as_half2, y8, z8); + dq[12] = __hadd2(q12.as_half2, z1); + dq[13] = __hfma2(q13.as_half2, y8, z8); + dq[14] = __hfma2(q14.as_half2, y64, z64); + dq[15] = __hadd2(q15.as_half2, z1); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/csrc/quantization/gptq/qdq_4.cuh b/csrc/quantization/gptq/qdq_4.cuh index cfc4635a22c1d..881f353f6564d 100644 --- a/csrc/quantization/gptq/qdq_4.cuh +++ b/csrc/quantization/gptq/qdq_4.cuh @@ -38,16 +38,17 @@ __forceinline__ __device__ void dequant_4bit_8 ( const uint32_t q_0, half2 (&dq)[4], - int stride + int stride, + const uint32_t zero ) { const uint32_t c0 = 0x64006400; const half y16_ = __float2half_rn(1.0f / 16.0f); const half2 y16 = __halves2half2(y16_, y16_); - const half z1_ = __float2half_rn(-1024.0f - 8.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z16 = __halves2half2(z16_, z16_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z16 = __half2half2(z16_); uint32_t qa = q_0; half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 @@ -143,93 +144,4 @@ __forceinline__ __device__ void dequant_4bit_8_gptq } // namespace gptq } // namespace vllm -#else - -namespace vllm { -namespace gptq { -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - half dqh[8]; - for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1)[2], - half2 (&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z = __hmul(z, scale); - z1[0] = __half2half2(z); - y1[0] = __half2half2(scale); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1)[2], - half2(&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z1[0] = __half2half2(z); -} - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1)[2], - half2 (&y1)[2], - int stride, - bool scaled -) -{ - half2 dqh2[8]; - - uint32_t qa = q_0; - for (int i = 0; i < 4; i++) - { - half d0 = __int2half_rn(qa & 0x0f); qa >>= 4; - half d1 = __int2half_rn(qa & 0x0f); qa >>= 4; - dqh2[i] = __halves2half2(d0, d1); - } - - if (scaled) - { - dq[0] = __hfma2(dqh2[0], y1[0], z1[0]); - dq[1] = __hfma2(dqh2[1], y1[0], z1[0]); - dq[2] = __hfma2(dqh2[2], y1[0], z1[0]); - dq[3] = __hfma2(dqh2[3], y1[0], z1[0]); - } - else - { - dq[0] = __hadd2(dqh2[0], z1[0]); - dq[1] = __hadd2(dqh2[1], z1[0]); - dq[2] = __hadd2(dqh2[2], z1[0]); - dq[3] = __hadd2(dqh2[3], z1[0]); - } -} - -} // namespace gptq -} // namespace vllm - #endif diff --git a/csrc/quantization/gptq/qdq_8.cuh 
b/csrc/quantization/gptq/qdq_8.cuh new file mode 100644 index 0000000000000..0c7ad7876140b --- /dev/null +++ b/csrc/quantization/gptq/qdq_8.cuh @@ -0,0 +1,40 @@ +/* +Copied from https://github.com/turboderp/exllamav2 +*/ + +#ifndef _qdq_8_cuh +#define _qdq_8_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { + +__forceinline__ __device__ void shuffle_8bit_4 +( + uint32_t* q, + int stride +) +{ +} + +__forceinline__ __device__ void dequant_8bit_8 +( + const uint32_t q_0, + const uint32_t q_1, + half2 (&dq)[4], + int stride, + const uint32_t zero +) +{ + half dqh[8]; + for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); + for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); + + for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7218760fbe55d..2e6aabb232673 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,6 +1,7 @@ import enum from enum import Enum from typing import Any, Dict, List, Optional +from fractions import Fraction import torch from torch.nn.parameter import Parameter @@ -27,11 +28,10 @@ def __init__( self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act - self.pack_factor = 32 // self.weight_bits - # exllama kernel v1 only supports 4 bit - if self.weight_bits != 4: + self.pack_factor = Fraction(32, self.weight_bits) + if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 4-bit weight quantization is supported for " + "Currently, only 2/3/4/8-bit weight quantization is supported for " f"GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: @@ -101,7 +101,7 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor != 0: + if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -201,11 +201,13 @@ def apply_weights(self, else: weights["g_idx"] = torch.empty((1, 1), device="meta") weights["exllama_state"] = ExllamaState.READY - ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) + ops.gptq_shuffle(weights["qweight"], weights["g_idx"], + self.quant_config.weight_bits) output = ops.gptq_gemm(reshaped_x, weights["qweight"], weights["qzeros"], weights["scales"], weights["g_idx"], - weights["exllama_state"] == ExllamaState.READY) + weights["exllama_state"] == ExllamaState.READY, + self.quant_config.weight_bits) if bias is not None: output = output + bias return output.reshape(out_shape) From a6d471c75939b2f4708a4e1cb1aa3b7b993ee54b Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Thu, 29 Feb 2024 01:04:07 -0500 Subject: [PATCH 030/113] Fix: `AttributeError` in OpenAI-compatible server (#3018) --- vllm/entrypoints/openai/protocol.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e85e7e2b1ede9..97cfd797587c4 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -57,7 +57,7 @@ class UsageInfo(BaseModel): class ChatCompletionRequest(BaseModel): model: str - messages: Union[str, List[Dict[str, str]]] + messages: List[Dict[str, str]] temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 n: Optional[int] = 1 diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5635ac6c9e106..e5ae39e110a40 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -80,7 +80,7 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: return self.response_role else: - return request.messages[-1].role + return request.messages[-1]["role"] async def chat_completion_stream_generator( self, request: ChatCompletionRequest, From 9289e577ec185bd9feb2c03bb86b82f1bf9bb633 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Thu, 29 Feb 2024 14:15:18 +0800 Subject: [PATCH 031/113] add cache_config's info to prometheus metrics. 
(#3100) --- vllm/config.py | 4 ++++ vllm/engine/llm_engine.py | 1 + vllm/engine/metrics.py | 10 +++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index fc848b72d7f2a..2f8883fe0733e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -308,6 +308,10 @@ def __init__( self.num_gpu_blocks = None self.num_cpu_blocks = None + def metrics_info(self): + # convert cache_config to dict(key: str, value:str) for prometheus metrics info + return {key: str(value) for key, value in self.__dict__.items()} + def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0fd7efdef813..6f5af71426d78 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -138,6 +138,7 @@ def __init__( self.stat_logger = StatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC, labels=dict(model_name=model_config.model)) + self.stat_logger.info("cache_config", self.cache_config) self.forward_dag = None if USE_RAY_COMPILED_DAG: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 83e66a9372272..54b09c38f58a5 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,5 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics +from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics import time import numpy as np @@ -23,6 +23,10 @@ def __init__(self, labelnames: List[str]): if hasattr(collector, "_name") and "vllm" in collector._name: REGISTRY.unregister(collector) + self.info_cache_config = Info( + name='vllm:cache_config', + documentation='information of cache_config') + # System stats self.gauge_scheduler_running = Gauge( name="vllm:num_requests_running", @@ -128,6 +132,10 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: self.labels = labels self.metrics = Metrics(labelnames=list(labels.keys())) + def info(self, type: str, obj: object) -> None: + if type == "cache_config": + self.metrics.info_cache_config.info(obj.metrics_info()) + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: return float(np.sum(tracked_stats) / (now - self.last_local_log)) From bfdcfa6a053c693800551bd1bd71acabbe1941e8 Mon Sep 17 00:00:00 2001 From: Seonghyeon Date: Thu, 29 Feb 2024 17:51:48 +0900 Subject: [PATCH 032/113] Support starcoder2 architecture (#3089) --- README.md | 1 + tests/models/test_models.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/starcoder2.py | 310 ++++++++++++++++++ vllm/transformers_utils/config.py | 10 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/starcoder2.py | 127 +++++++ 7 files changed, 452 insertions(+) create mode 100644 vllm/model_executor/models/starcoder2.py create mode 100644 vllm/transformers_utils/configs/starcoder2.py diff --git a/README.md b/README.md index f771788db2b89..064faa550f267 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.) - StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.) +- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.) - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) 
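Since the list above now advertises Starcoder2 checkpoints, a short offline-inference sketch may help readers try the new architecture; the model name and sampling values below are only examples, and the patch bundles its own `Starcoder2Config` fallback so an older `transformers` release still works:

```python
from vllm import LLM, SamplingParams

# Any of the checkpoints listed above should work, e.g. the 3B variant.
llm = LLM(model="bigcode/starcoder2-3b")

prompts = ["def fibonacci(n):"]
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```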
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e44452e9893cf..fb567e837d281 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -19,6 +19,7 @@ "microsoft/phi-2", "stabilityai/stablelm-3b-4e1t", "allenai/OLMo-1B", + "bigcode/starcoder2-3b", ] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index e4f3a785cd99a..75c2ae1e9f48e 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -45,6 +45,7 @@ "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), + "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), } # Models not supported by ROCm. diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py new file mode 100644 index 0000000000000..1eda07b724cae --- /dev/null +++ b/vllm/model_executor/models/starcoder2.py @@ -0,0 +1,310 @@ +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Starcoder2 model.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +try: + from transformers import Starcoder2Config +except ImportError: + # fallback to PretrainedConfig + # NOTE: Please install transformers from source or use transformers>=4.39.0 + from transformers import PretrainedConfig as Starcoder2Config + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class Starcoder2Attention(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = self.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + self.max_position_embeddings = config.max_position_embeddings + self.use_bias = config.use_bias + self.sliding_window = config.sliding_window + + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=self.use_bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + self.hidden_size, + bias=self.use_bias, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Starcoder2MLP(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.c_fc = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.act = get_act_fn(config.hidden_act, + intermediate_size=config.intermediate_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.c_proj(hidden_states) + return hidden_states + + +class Starcoder2DecoderLayer(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Starcoder2Attention(config, + linear_method=linear_method) + self.mlp = Starcoder2MLP(config, linear_method=linear_method) + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + 
hidden_states + + return hidden_states + + +class Starcoder2Model(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # TODO: consider padding_idx (currently removed) + self.embed_tokens = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.layers = nn.ModuleList([ + Starcoder2DecoderLayer(config, linear_method=linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer(positions, hidden_states, kv_caches[i], + input_metadata) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class Starcoder2ForCausalLM(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.model = Starcoder2Model(config, linear_method=linear_method) + self.vocab_size = config.vocab_size + self.unpadded_vocab_size = config.vocab_size + if config.tie_word_embeddings: + self.lm_head_weight = self.model.embed_tokens.weight + else: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + ) + self.lm_head_weight = self.lm_head.weight + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b0413f440a0e..5e1f0439aec51 100644 --- 
a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -9,6 +9,7 @@ "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) + "starcoder2": Starcoder2Config, } @@ -16,6 +17,15 @@ def get_config(model: str, trust_remote_code: bool, revision: Optional[str] = None, code_revision: Optional[str] = None) -> PretrainedConfig: + # FIXME(woosuk): This is a temporary fix for StarCoder2. + # Remove this when the model is supported by HuggingFace transformers. + if "bigcode" in model and "starcoder2" in model: + config_class = _CONFIG_REGISTRY["starcoder2"] + config = config_class.from_pretrained(model, + revision=revision, + code_revision=code_revision) + return config + try: config = AutoConfig.from_pretrained( model, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ef955f75cedaa..4966526f15184 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,9 +4,11 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config __all__ = [ "ChatGLMConfig", "MPTConfig", "RWConfig", + "Starcoder2Config", ] diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py new file mode 100644 index 0000000000000..4c3b6b8def074 --- /dev/null +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -0,0 +1,127 @@ +from transformers import PretrainedConfig + + +class Starcoder2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a + Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. + + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49152): + Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Starcoder2Model`] + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 12288): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 30): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 24): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + norm_epsilon (`float`, *optional*, defaults to 1e-05): + Epsilon value for the layer norm + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + bos_token_id (`int`, *optional*, defaults to 50256): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 50256): + The id of the "end-of-sequence" token. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None` (no sliding window). + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + residual_dropout (`float`, *optional*, defaults to 0.0): + Residual connection dropout value. + embedding_dropout (`float`, *optional*, defaults to 0.0): + Embedding dropout. + use_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias term on linear layers of the model. 
+ + + ```python + >>> from transformers import Starcoder2Model, Starcoder2Config + + >>> # Initializing a Starcoder2 7B style configuration + >>> configuration = Starcoder2Config() + + >>> # Initializing a model from the Starcoder2 7B style configuration + >>> model = Starcoder2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "starcoder2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=49152, + hidden_size=3072, + intermediate_size=12288, + num_hidden_layers=30, + num_attention_heads=24, + num_key_value_heads=2, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=4096, + initializer_range=0.018042, + norm_epsilon=1e-5, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + rope_theta=10000.0, + sliding_window=None, + attention_dropout=0.0, + residual_dropout=0.0, + embedding_dropout=0.0, + use_bias=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.use_bias = use_bias + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_epsilon = norm_epsilon + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.residual_dropout = residual_dropout + self.embedding_dropout = embedding_dropout + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + if self.architectures is None: + self.architectures = ['Starcoder2ForCausalLM'] From 2c08ff23c07f2f8d51da8e1783c5346dccc1fd12 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Fri, 1 Mar 2024 03:13:58 +0800 Subject: [PATCH 033/113] Fix building from source on WSL (#3112) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16978d74e0425..1f48be948aa84 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def _is_neuron() -> bool: torch_neuronx_installed = True try: subprocess.run(["neuron-ls"], capture_output=True, check=True) - except FileNotFoundError: + except (FileNotFoundError, PermissionError): torch_neuronx_installed = False return torch_neuronx_installed From 29a8d6a554a87292f05b62078976b43a899691e3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 29 Feb 2024 11:20:42 -0800 Subject: [PATCH 034/113] [Fix] Don't deep-copy LogitsProcessors when copying SamplingParams (#3099) --- vllm/engine/llm_engine.py | 5 +++-- vllm/sampling_params.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6f5af71426d78..9bf19b932d35b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -484,8 +484,9 @@ def add_request( prompt_token_ids[:prefix_pos], lora_request.lora_int_id if lora_request else 0) if prefix_pos is not None else None - # Defensive copy of SamplingParams, which are used by the sampler - sampling_params = copy.deepcopy(sampling_params) + # Defensive copy of SamplingParams, which are used by the sampler, + # this doesn't deep-copy LogitsProcessor objects + sampling_params = sampling_params.clone() # Create the sequence group. 
seq_group = SequenceGroup(request_id, [seq], sampling_params, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 51d39220ca9ca..8103f3c2b24bf 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,4 +1,5 @@ """Sampling parameters for text generation.""" +import copy from enum import IntEnum from functools import cached_property from typing import Callable, List, Optional, Union @@ -237,6 +238,20 @@ def sampling_type(self) -> SamplingType: return SamplingType.RANDOM_SEED return SamplingType.RANDOM + def clone(self) -> "SamplingParams": + """Deep copy excluding LogitsProcessor objects. + + LogitsProcessor objects are excluded because they may contain an + arbitrary, nontrivial amount of data. + See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + def __repr__(self) -> str: return ( f"SamplingParams(n={self.n}, " From 703e42ee4b3efed3c71e7ae7d15f0f96e05722d4 Mon Sep 17 00:00:00 2001 From: felixzhu555 <79335195+felixzhu555@users.noreply.github.com> Date: Thu, 29 Feb 2024 14:13:08 -0800 Subject: [PATCH 035/113] Add guided decoding for OpenAI API server (#2819) Co-authored-by: br3no Co-authored-by: simon-mo --- requirements.txt | 1 + tests/entrypoints/test_guided_processors.py | 75 ++++++ tests/entrypoints/test_openai_server.py | 237 ++++++++++++++++++ vllm/engine/async_llm_engine.py | 3 + vllm/entrypoints/openai/protocol.py | 36 ++- vllm/entrypoints/openai/serving_chat.py | 9 + vllm/entrypoints/openai/serving_completion.py | 9 + vllm/model_executor/guided_decoding.py | 99 ++++++++ .../guided_logits_processors.py | 129 ++++++++++ 9 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 tests/entrypoints/test_guided_processors.py create mode 100644 vllm/model_executor/guided_decoding.py create mode 100644 vllm/model_executor/guided_logits_processors.py diff --git a/requirements.txt b/requirements.txt index d4599ec95d945..05ec2e804e13b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 +outlines >= 0.0.27 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py new file mode 100644 index 0000000000000..5b39269916f8b --- /dev/null +++ b/tests/entrypoints/test_guided_processors.py @@ -0,0 +1,75 @@ +# This unit test should be moved to a new +# tests/test_guided_decoding directory. 
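The `SamplingParams.clone()` helper introduced a few hunks above leans on `copy.deepcopy`'s memo argument: pre-seeding the memo with `id(obj): obj` makes `deepcopy` return the original object wherever it appears instead of copying it. A self-contained sketch of that idiom, where `Big` merely stands in for an expensive `LogitsProcessor`:

```python
import copy


class Big:
    """Stand-in for a LogitsProcessor holding a large amount of state."""


shared = Big()
params = {"temperature": 0.7, "logits_processors": [shared]}

# Seed the memo so deepcopy treats `shared` as already copied.
clone = copy.deepcopy(params, {id(shared): shared})

assert clone is not params                              # the container is copied
assert clone["logits_processors"] is not params["logits_processors"]
assert clone["logits_processors"][0] is shared          # the processor is shared
```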
+ +from transformers import AutoTokenizer +import torch + +from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor, + JSONLogitsProcessor) + +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + + +def test_guided_logits_processors(): + """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" + tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') + regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) + json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer) + + regex_LP.init_state() + token_ids = tokenizer.encode( + f"Give an example IPv4 address with this regex: {TEST_REGEX}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + regex_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) + + json_LP.init_state() + token_ids = tokenizer.encode( + f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + json_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 72e2374899793..e426cf7eed72b 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -9,12 +9,64 @@ import openai # use the official client for correctness check from huggingface_hub import snapshot_download # downloading lora to test lora requests +# imports for guided decoding tests +import json +import jsonschema +import re + from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + +TEST_CHOICE = [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", + "Swift", "Kotlin" +] + pytestmark = pytest.mark.asyncio @@ -325,6 +377,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): max_tokens=max_tokens, temperature=0.0, 
logit_bias={str(token_id): 100}, + seed=42, ) assert completion.choices[0].text is not None and len( completion.choices[0].text) >= 5 @@ -358,5 +411,189 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): assert first_response != completion.choices[0].text +async def test_guided_json_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt= + f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + n=3, + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 3 + for i in range(3): + assert completion.choices[i].text is not None + output_json = json.loads(completion.choices[i].text) + jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) + + +async def test_guided_json_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "Give an example JSON for an employee profile that " + \ + f"fits this schema: {TEST_SCHEMA}" + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + message = chat_completion.choices[0].message + assert message.content is not None + json1 = json.loads(message.content) + jsonschema.validate(instance=json1, schema=TEST_SCHEMA) + + messages.append({"role": "assistant", "content": message.content}) + messages.append({ + "role": + "user", + "content": + "Give me another one with a different name and age" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + message = chat_completion.choices[0].message + assert message.content is not None + json2 = json.loads(message.content) + jsonschema.validate(instance=json2, schema=TEST_SCHEMA) + assert json1["name"] != json2["name"] + assert json1["age"] != json2["age"] + + +async def test_guided_regex_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}", + n=3, + temperature=1.0, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 3 + for i in range(3): + assert completion.choices[i].text is not None + assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None + + +async def test_guided_regex_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example IP address with this regex: {TEST_REGEX}" + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + ip1 = chat_completion.choices[0].message.content + assert ip1 is not None + assert re.fullmatch(TEST_REGEX, ip1) is not None + + messages.append({"role": "assistant", "content": ip1}) + messages.append({"role": "user", "content": "Give me a different one"}) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + ip2 = 
chat_completion.choices[0].message.content + assert ip2 is not None + assert re.fullmatch(TEST_REGEX, ip2) is not None + assert ip1 != ip2 + + +async def test_guided_choice_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt="The best language for type-safe systems programming is ", + n=2, + temperature=1.0, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 2 + for i in range(2): + assert completion.choices[i].text in TEST_CHOICE + + +async def test_guided_choice_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + choice1 = chat_completion.choices[0].message.content + assert choice1 in TEST_CHOICE + + messages.append({"role": "assistant", "content": choice1}) + messages.append({ + "role": "user", + "content": "I disagree, pick another one" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + choice2 = chat_completion.choices[0].message.content + assert choice2 in TEST_CHOICE + assert choice1 != choice2 + + +async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI): + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example JSON that fits this schema: 42", + extra_body=dict(guided_json=42)) + + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + extra_body=dict(guided_regex={ + 1: "Python", + 2: "C++" + })) + + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example string that fits this regex", + extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7cba654602779..daa6419cdad3b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -333,6 +333,9 @@ def is_running(self) -> bool: return (self.background_loop is not None and not self.background_loop.done()) + def get_tokenizer(self): + return self.engine.tokenizer.tokenizer + def start_background_loop(self) -> None: """Start the background loop.""" if self.is_running: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 97cfd797587c4..26499b8d7a66f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,7 +3,7 @@ import time from typing import Dict, List, Literal, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams @@ -86,6 +86,9 @@ class ChatCompletionRequest(BaseModel): min_p: 
Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + guided_json: Optional[Union[str, dict, BaseModel]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: @@ -131,6 +134,20 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode="before") + @classmethod + def check_guided_decoding_count(cls, data): + guide_count = sum([ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding " + "('guided_json', 'guided_regex' or 'guided_choice').") + return data + class CompletionRequest(BaseModel): model: str @@ -163,6 +180,9 @@ class CompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + guided_json: Optional[Union[str, dict, BaseModel]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 @@ -207,6 +227,20 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode="before") + @classmethod + def check_guided_decoding_count(cls, data): + guide_count = sum([ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding " + "('guided_json', 'guided_regex' or 'guided_choice').") + return data + class LogProbs(BaseModel): text_offset: List[int] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index e5ae39e110a40..f4ad0aa5a0184 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,6 +12,7 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor logger = init_logger(__name__) @@ -62,6 +63,14 @@ async def create_chat_completion( prompt=prompt) sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) + guided_decode_logits_processor = ( + await get_guided_decoding_logits_processor( + request, self.engine.get_tokenizer())) + if guided_decode_logits_processor: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append( + guided_decode_logits_processor) except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 610f53549da48..713e67793b290 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -16,6 +16,7 @@ ) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor logger = init_logger(__name__) @@ 
-286,6 +287,14 @@ async def create_completion(self, request: CompletionRequest, try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) + guided_decode_logit_processor = ( + await get_guided_decoding_logits_processor( + request, self.engine.get_tokenizer())) + if guided_decode_logit_processor is not None: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append( + guided_decode_logit_processor) prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py new file mode 100644 index 0000000000000..a8573f8bdc6c8 --- /dev/null +++ b/vllm/model_executor/guided_decoding.py @@ -0,0 +1,99 @@ +import asyncio +import concurrent.futures +from copy import copy +from enum import Enum +from functools import lru_cache +from json import dumps as json_dumps +from re import escape as regex_escape +from typing import Union, Tuple +from pydantic import BaseModel + +from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest +from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor + + +class GuidedDecodingMode(Enum): + JSON = "json" + REGEX = "regex" + CHOICE = "choice" + + +global_thread_pool = None # used for generating logits processor fsm + + +async def get_guided_decoding_logits_processor( + request: Union[CompletionRequest, ChatCompletionRequest], + tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]: + """ + Given an OpenAI-compatible request, check for guided decoding parameters + and get the necessary logits processor for the given guide. + We cache logit processors by (guide, tokenizer), and on cache hit + we make a shallow copy to reuse the same underlying FSM. 
+ """ + global global_thread_pool + guide, mode = _get_guide_and_mode(request) + if not guide: + return None + + if global_thread_pool is None: + global_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=2) + loop = asyncio.get_running_loop() + + result = await loop.run_in_executor(global_thread_pool, + _get_cached_logits_processor, guide, + tokenizer, mode) + + logits_processor = copy(result) + # reset logits processor's internal state + logits_processor.init_state() + return logits_processor + + +def _get_guide_and_mode( + request: Union[CompletionRequest, ChatCompletionRequest] +) -> Tuple[str, GuidedDecodingMode]: + + if request.guided_json: + if not isinstance(request.guided_json, (str, dict, BaseModel)): + raise TypeError("JSON schema must be str, dict, or BaseModel") + + json = request.guided_json + if isinstance(json, dict): + # turn dict into hashable string + json = json_dumps(json, sort_keys=True) + elif isinstance(json, BaseModel): + # use pydantic signature so that different model classes + # with the same fields will get hashed the same + json = str(json.__signature__) + return json, GuidedDecodingMode.JSON + + elif request.guided_regex: + if not isinstance(request.guided_regex, str): + raise TypeError("Regex must be string") + return request.guided_regex, GuidedDecodingMode.REGEX + + elif request.guided_choice: + if not isinstance(request.guided_choice, list): + raise TypeError("Choices must be a list") + + # choice just uses regex + choices = [ + regex_escape(str(choice)) for choice in request.guided_choice + ] + choices_regex = "(" + "|".join(choices) + ")" + return choices_regex, GuidedDecodingMode.CHOICE + + else: + return None, None + + +@lru_cache(maxsize=32) +def _get_cached_logits_processor(guide: str, tokenizer, + mode: GuidedDecodingMode): + if mode == GuidedDecodingMode.JSON: + return JSONLogitsProcessor(guide, tokenizer) + elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE: + return RegexLogitsProcessor(guide, tokenizer) + else: + raise ValueError(f"Unknown guided decoding mode {mode}") diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py new file mode 100644 index 0000000000000..1b3e5e71a5911 --- /dev/null +++ b/vllm/model_executor/guided_logits_processors.py @@ -0,0 +1,129 @@ +# Copyright 2024- the Outlines developers +# This file is adapted from +# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import math +from collections import defaultdict +from typing import Union, DefaultDict, Dict, List, Optional + +import torch +from pydantic import BaseModel +from outlines.fsm.fsm import RegexFSM +from outlines.fsm.json_schema import build_regex_from_schema + + +class RegexLogitsProcessor: + + def __init__(self, regex_string: str, tokenizer): + """Compile the FSM that drives the regex-structured generation. 
+ + Parameters + ---------- + regex_string + A string that represents a regular expression + tokenizer + The model's tokenizer + + """ + tokenizer = self.adapt_tokenizer(tokenizer) + fsm = RegexFSM(regex_string, tokenizer) + self.fsm = fsm + + def init_state(self): + """Initialize the FSM states.""" + self.fsm_state: DefaultDict[int, int] = defaultdict(int) + + def __call__(self, input_ids: List[int], + scores: torch.Tensor) -> torch.Tensor: + """Use the FSM to bias the logits before sampling the next token.""" + + seq_id = hash(tuple(input_ids)) + + if len(input_ids) == 0: + self.init_state() + else: + last_token = input_ids[-1] + last_seq_id = hash(tuple(input_ids[:-1])) + self.fsm_state[seq_id] = self.fsm.next_state( + self.fsm_state[last_seq_id], last_token) + + allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id]) + + mask = torch.full((scores.shape[-1], ), + -math.inf, + device=scores.device) + mask[allowed_tokens] = 0 + scores.add_(mask) + + return scores + + def adapt_tokenizer(self, tokenizer): + """Adapt vLLM's tokenizer to use to compile the FSM. + + The API of Outlines tokenizers is slightly different to that of + `transformers`. In addition we need to handle the missing spaces to + Llama's tokenizer to be able to compile FSMs for this model. + + """ + tokenizer.vocabulary = tokenizer.get_vocab() + tokenizer.special_tokens = set(tokenizer.all_special_tokens) + + def convert_token_to_string(token: str) -> str: + from transformers.file_utils import SPIECE_UNDERLINE + + string = tokenizer.convert_tokens_to_string([token]) + + # A hack to handle missing spaces to HF's Llama tokenizers + if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>": + return " " + string + + return string + + tokenizer.convert_token_to_string = convert_token_to_string + + return tokenizer + + +class JSONLogitsProcessor(RegexLogitsProcessor): + + def __init__(self, + schema: Union[str, Dict, BaseModel], + tokenizer, + whitespace_pattern: Optional[str] = None): + """Compile the FSM that drives the JSON-guided generation. + + Parameters + ---------- + schema + A JSON schema that encodes the structure we want the model to generate + tokenizer + The model's tokenizer + whitespace_pattern + Pattern to use for JSON syntactic whitespace (doesn't impact string literals) + Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + """ + if isinstance(schema, type(BaseModel)): + schema_str = json.dumps(schema.model_json_schema()) + elif isinstance(schema, Dict): + schema_str = json.dumps(schema) + elif isinstance(schema, str): + schema_str = schema + else: + raise ValueError( + f"Cannot parse schema {schema}. 
The schema must be either " + + "a Pydantic object, a dictionary or a string that contains the JSON " + + "Schema specification") + regex_string = build_regex_from_schema(schema_str, whitespace_pattern) + super().__init__(regex_string, tokenizer) From 54d3544784ff20e7038abf72793eaf734e727269 Mon Sep 17 00:00:00 2001 From: Sherry <503147114@qq.com> Date: Fri, 1 Mar 2024 15:52:22 +0800 Subject: [PATCH 036/113] Fix: Output text is always truncated in some models (#3016) --- vllm/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9bf19b932d35b..df4858a696530 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -980,7 +980,10 @@ def _check_stop(self, seq: Sequence, def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None: - if not sampling_params.include_stop_str_in_output and stop_string: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): # Truncate the output text so that the stop string is # not included in the output. seq.output_text = seq.output_text[:-len(stop_string)] From 27ca23dc002e06eade014ac6b801dc2dcbea40f3 Mon Sep 17 00:00:00 2001 From: Seonghyeon Date: Sat, 2 Mar 2024 02:59:06 +0900 Subject: [PATCH 037/113] Remove exclude_unset in streaming response (#3143) --- vllm/entrypoints/openai/serving_completion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 713e67793b290..86b753fa06ab5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -96,7 +96,7 @@ async def completion_stream_generator( logprobs=logprobs, finish_reason=finish_reason, ) - ]).model_dump_json(exclude_unset=True) + ]).model_dump_json() yield f"data: {response_json}\n\n" if output.finish_reason is not None: # return final usage @@ -121,7 +121,7 @@ async def completion_stream_generator( ) ], usage=final_usage, - ).model_dump_json(exclude_unset=True) + ).model_dump_json() yield f"data: {response_json}\n\n" yield "data: [DONE]\n\n" @@ -306,7 +306,7 @@ async def create_completion(self, request: CompletionRequest, request, prompt=prompt) generators.append( - self.engine.generate(None, + self.engine.generate(prompt, sampling_params, f"{request_id}-{i}", prompt_token_ids=input_ids, From 49d849b3ab7aa6ae493ccde1d85d226833f73fbb Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 1 Mar 2024 14:04:14 -0500 Subject: [PATCH 038/113] docs: Add tutorial on deploying vLLM model with KServe (#2586) Signed-off-by: Yuan Tang --- docs/source/index.rst | 1 + docs/source/serving/deploying_with_kserve.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/serving/deploying_with_kserve.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 32929257661ad..bdc541cb2d58e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -70,6 +70,7 @@ Documentation serving/distributed_serving serving/run_on_sky + serving/deploying_with_kserve serving/deploying_with_triton serving/deploying_with_docker serving/serving_with_langchain diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst new file mode 100644 index 0000000000000..7f22766e09aef --- /dev/null +++ b/docs/source/serving/deploying_with_kserve.rst @@ -0,0 +1,8 @@ +.. 
_deploying_with_kserve: + +Deploying with KServe +============================ + +vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. + +Please see `this guide `_ for more details on using vLLM with KServe. From 90fbf12540da089fcc7dc825ce2ceb7ea3a3df33 Mon Sep 17 00:00:00 2001 From: Huarong Date: Sat, 2 Mar 2024 03:42:06 +0800 Subject: [PATCH 039/113] fix relative import path of protocol.py (#3134) Co-authored-by: huohuarong --- vllm/entrypoints/openai/serving_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 86b753fa06ab5..99a10196b5f73 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine -from .protocol import ( +from vllm.entrypoints.openai.protocol import ( CompletionRequest, CompletionResponse, CompletionResponseChoice, From c0c2335ce027486d254c31f665ce00d7db427d22 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:47:51 -0600 Subject: [PATCH 040/113] Integrate Marlin Kernels for Int4 GPTQ inference (#2497) Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com> Co-authored-by: alexm --- csrc/ops.h | 9 + csrc/pybind.cpp | 4 +- csrc/quantization/marlin/LICENSE | 209 +++ .../quantization/marlin/marlin_cuda_kernel.cu | 1145 +++++++++++++++++ requirements-dev.txt | 1 + setup.py | 2 + tests/conftest.py | 32 + tests/models/test_marlin.py | 97 ++ vllm/config.py | 18 +- vllm/model_executor/layers/linear.py | 29 + .../layers/quantization/__init__.py | 2 + .../layers/quantization/marlin.py | 210 +++ 12 files changed, 1752 insertions(+), 6 deletions(-) create mode 100644 csrc/quantization/marlin/LICENSE create mode 100644 csrc/quantization/marlin/marlin_cuda_kernel.cu create mode 100644 tests/models/test_marlin.py create mode 100644 vllm/model_executor/layers/quantization/marlin.py diff --git a/csrc/ops.h b/csrc/ops.h index 08dfb0e8604f1..249c7451bf73c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -84,6 +84,15 @@ torch::Tensor awq_dequantize( int split_k_iters, int thx, int thy); + +torch::Tensor marlin_gemm( + torch::Tensor& a, + torch::Tensor& b_q_weight, + torch::Tensor& b_scales, + torch::Tensor& workspace, + int64_t size_m, + int64_t size_n, + int64_t size_k); #endif void squeezellm_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 5d062bb5700bc..4b6ade7566398 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -52,11 +52,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - // Quantization ops +// Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); + ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); #endif + ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); diff --git a/csrc/quantization/marlin/LICENSE b/csrc/quantization/marlin/LICENSE new file mode 100644 index 0000000000000..1d1e4cf9c8233 --- /dev/null +++ 
b/csrc/quantization/marlin/LICENSE @@ -0,0 +1,209 @@ +Contains code from https://github.com/IST-DASLab/marlin + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------------------------------------------------------------------------ + +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses/ +for text of these licenses. diff --git a/csrc/quantization/marlin/marlin_cuda_kernel.cu b/csrc/quantization/marlin/marlin_cuda_kernel.cu new file mode 100644 index 0000000000000..cf1b0afdec8b4 --- /dev/null +++ b/csrc/quantization/marlin/marlin_cuda_kernel.cu @@ -0,0 +1,1145 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +template inline std::string str(T x) { return std::to_string(x); } + +namespace marlin { + +constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + +// Instances of `Vec` are used to organize groups of >>registers<<, as needed +// for instance as inputs to tensor core operations. Consequently, all +// corresponding index accesses must be compile-time constants, which is why we +// extensively use `#pragma unroll` throughout the kernel code to guarantee +// this. +template struct Vec { + T elems[n]; + __device__ T &operator[](int i) { return elems[i]; } +}; + +using I4 = Vec; + +// Matrix fragments for tensor core instructions; their precise layout is +// documented here: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type +using FragA = Vec; +using FragB = Vec; +using FragC = Vec; +using FragS = Vec; // quantization scales + +// Predicated asynchronous global->shared copy; used for inputs A where we apply +// predication to handle batchsizes that are not multiples of 16. +__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +// Asynchronous global->shared copy with a cache hint indicating that the values +// may be evicted immediately; used for quantized weights B, which are only +// accessed precisely once and should thus not pollute the L2 cache which we +// need for inputs A and outputs C. 
+__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .b64 p;\n" + " createpolicy.fractional.L2::evict_first.b64 p, 1.0;" + " cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +} + +// Async copy fence. +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +// Wait until at most `n` async copy stages are still pending. +template __device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +__device__ inline void mma(const FragA &a_frag, const FragB &frag_b, + FragC &frag_c) { + const uint32_t *a = reinterpret_cast(&a_frag); + const uint32_t *b = reinterpret_cast(&frag_b); + float *c = reinterpret_cast(&frag_c); + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), + "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { + uint32_t *a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template __device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 +// values. We mostly follow the strategy in the link below, with some small +// changes: +// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +__device__ inline FragB dequant(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. 
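A rough Python reference for what `dequant()` followed by `scale()` computes, ignoring Marlin's interleaved fragment layout and weight repacking (this sketch illustrates the arithmetic only, not the storage order): each 32-bit word packs eight 4-bit weights with a symmetric zero point of 8, and every weight in a group shares one fp16 scale.

```python
import numpy as np


def dequant_reference(packed_word: int, group_scale: float) -> np.ndarray:
    """(q - 8) * s for the eight 4-bit values in one 32-bit word."""
    nibbles = np.array([(packed_word >> (4 * i)) & 0xF for i in range(8)],
                       dtype=np.float32)
    return (nibbles - 8.0) * group_scale


# Nibbles from low to high: 0xF, 0xE, ..., 0x8 -> 7, 6, ..., 0 after the
# zero-point shift, then scaled by 0.5.
print(dequant_reference(0x89ABCDEF, group_scale=0.5))
# [3.5, 3.0, 2.5, 2.0, 1.5, 1.0, 0.5, 0.0]
```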
+__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int *lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int *lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks with + // a separate quantization scale + > +__global__ void +Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk + const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn + int4 *__restrict__ C, // fp16 output buffer of shape mxn + const int4 + *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int *locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts in + // the middle of group. 
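The striped partitioning described in the comment block above can be reproduced in a few lines of Python. The sketch below is illustrative only (it assumes `parallel == 1` and omits the group-size rounding of `iters` that the kernel applies immediately after this point); it recovers the 3x3-tile, 5-SM example from the comment:

```python
def ceildiv(a, b):
    return (a + b - 1) // b


def stripe_assignment(k_tiles, n_tiles, num_blocks):
    """Map each 16x16 tile (k index, n slice) to the block that owns it."""
    iters = ceildiv(k_tiles * n_tiles, num_blocks)
    owner = [[None] * n_tiles for _ in range(k_tiles)]
    for block in range(num_blocks):
        first = iters * block                       # first tile of this stripe
        last = min(first + iters, k_tiles * n_tiles)
        for t in range(first, last):                # tiles advance down a column slice
            owner[t % k_tiles][t // k_tiles] = block
    return owner


# Matches the 3x3-tile, 5-SM example above:
#   0 1 3
#   0 2 3
#   1 2 4
print(stripe_assignment(3, 3, 5))
```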
+ if (group_blocks != -1) + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. + auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) + slice_iters = 0; + if (slice_iters == 0) + return; + if (slice_row + slice_iters > k_tiles) + slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) + slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) + slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + // We typically use `constexpr` to indicate that this value is a compile-time + // constant + constexpr int a_sh_stride = + 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory + constexpr int a_gl_rd_delta_o = + 16 * thread_k_blocks / + 8; // delta between subsequent A tiles in global memory + int a_gl_rd_delta_i = + a_gl_stride * + (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile + constexpr int a_sh_wr_delta = + a_sh_stride * (threads / a_gl_rd_delta_o); // between shared memory writes + constexpr int a_sh_rd_delta_o = + 2 * ((threads / 32) / + (thread_n_blocks / 4)); // between shared memory tile reads + constexpr int a_sh_rd_delta_i = + a_sh_stride * 16; // within a shared memory tile + constexpr int a_sh_stage = + a_sh_stride * (16 * thread_m_blocks); // overall size of a tile + constexpr int a_sh_wr_iters = + ceildiv(a_sh_stage, + a_sh_wr_delta); // number of shared write iterations for a tile + + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 
8; + constexpr int s_sh_stage = s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + int s_sh_wr = threadIdx.x; + int s_sh_rd; + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + if (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { +#pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4 *B_ptr[b_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. 
+ int4 *sh_a = sh; + int4 *sh_b = sh_a + (stages * a_sh_stage); + int4 *sh_s = sh_b + (stages * b_sh_stage); + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; + + // Zero accumulators. + auto zero_accums = [&]() { +#pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4 *sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred( + &sh_a_stage[a_sh_wr_trans[i]], + &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4 *sh_b_stage = sh_b + b_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); + B_ptr[i] += b_gl_rd_delta_o; + } + // Only fetch scales if this tile starts a new group + if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { + int4 *sh_s_stage = sh_s + s_sh_stage * pipe; + if (s_sh_wr_pred) + cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); + s_gl_rd += s_gl_rd_delta; + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe) { + // It may seem inefficient that we reload the groups for every sub-tile; + // however, this does not seem to be a significant bottleneck, while some + // theoretically better attempts have lead to bad instruction ordering by + // the compiler and correspondingly a noticeable drop in performance. + if (group_blocks != -1) { + int4 *sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } + int4 *sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4 *sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + }; + + // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { +// We have the m dimension as the inner loop in order to encourage overlapping +// dequantization and matmul operations. +#pragma unroll + for (int j = 0; j < 4; j++) { + int b_quant = frag_b_quant[k % 2][j]; + int b_quant_shift = b_quant >> 8; + FragB frag_b0 = dequant(b_quant); + // If there are no groups, we can just scale the final output once and can + // avoid doing so for each weight. 
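On the per-column path (`group_blocks == -1`) there is exactly one scale per output column, so it factors out of every dot product and can be applied once when the output is written back (as the scale application in `write_result` further down does), instead of to each dequantized weight. A quick fp32 check of that identity, purely illustrative:

```python
import torch

A = torch.randn(16, 64)                             # fp16 activations in the kernel; fp32 here
B = torch.randint(0, 16, (64, 32)).float() - 8.0    # symmetric "int4" weights
s = torch.rand(32)                                  # one scale per output column

scale_each_weight = A @ (B * s)     # scaling every weight, as the grouped path must
scale_output_once = (A @ B) * s     # scaling the output once, deferred to the epilogue
assert torch.allclose(scale_each_weight, scale_output_once, atol=1e-4)
```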
+ if (group_blocks != -1) + scale(frag_b0, frag_s[k % 2][j], 0); + FragB frag_b1 = dequant(b_quant_shift); + if (group_blocks != -1) + scale(frag_b1, frag_s[k % 2][j], 1); +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride; + constexpr int red_sh_stride = b_sh_stride * 4 * 2; + constexpr int red_sh_delta = b_sh_stride; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + + (threadIdx.x % b_sh_stride); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + +#pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { +#pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { +#pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float *c_rd = reinterpret_cast( + &sh[red_sh_delta * j + red_sh_rd]); + float *c_wr = reinterpret_cast(&sh[red_sh_wr]); +#pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { +#pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float *c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); +#pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped partitioning + // minimizes the number of such reductions and our outputs are usually rather + // small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { +// Interestingly, doing direct global accesses here really seems to mess up the +// compiler and lead to slowdowns, hence we also use async-copies even though +// these fetches are not actually asynchronous. 
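
One detail worth spelling out about the async-copy loop below: partial results live in fp16 in `C` but are accumulated in fp32 registers, so every non-final slice does an fp16 read, an fp32 add, and an fp16 write-back. A tiny numerical model of that round trip (illustrative only):

import numpy as np

def accumulate_partial(c_fp16, partial_fp32):
    # Read the fp16 partial from C, add in fp32, write back in fp16; the same
    # precision pattern each non-final slice uses here.
    return (c_fp16.astype(np.float32) + partial_fp32).astype(np.float16)

c = np.zeros(4, dtype=np.float16)
for part in (np.full(4, 0.1, dtype=np.float32) for _ in range(3)):
    c = accumulate_partial(c, part)
print(c)   # ~0.3 per element, with fp16 rounding at every write-back
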
+#pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || + 8 * (i / 2) + row < prob_m); + } + cp_async_fence(); + cp_async_wait<0>(); + } + +#pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + __half2float(reinterpret_cast<__half *>(&c_red)[j]); + } + } + if (!last) { + int4 c; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast<__half *>(&c)[j] = + __float2half(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = + c; + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. + auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS &s) { + half2 res = __halves2half2(__float2half(c0), __float2half(c1)); + if (group_blocks == + -1) // for per-column quantization we finally apply the scale here + res = __hmul2(res, s[0]); + ((half2 *)sh)[idx] = res; + }; + if (threadIdx.x / 32 < thread_n_blocks / 4) { +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { +#pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + +#pragma unroll + for (int i = 0; + i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + C[c_gl_wr] = sh[c_sh_rd]; + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + }; + + // Start global fetch and register load pipelines. 
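
`start_pipes`, defined next, primes the multi-stage pipeline: `stages - 1` tile fetches are issued up front, and from then on each iteration waits for the oldest fetch, issues a new one into the freed slot, and runs the matmul on the tile that just arrived. A rough Python schematic of that steady state (illustrative only, not the kernel's exact issue order):

STAGES = 4  # pipeline depth, matching the kernel's `stages`

def run_pipeline(num_tiles: int) -> None:
    in_flight = list(range(min(STAGES - 1, num_tiles)))  # start_pipes(): prefetch stages-1 tiles
    for i in range(num_tiles):
        tile = in_flight.pop(0)        # wait_for_stage(): oldest fetch has landed
        assert tile == i
        nxt = i + STAGES - 1
        if nxt < num_tiles:
            in_flight.append(nxt)      # fetch_to_shared() for the slot just freed
        # matmul(tile) would run here, overlapped with the fetches still in flight

run_pipeline(10)
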
+ auto start_pipes = [&]() { +#pragma unroll + for (int i = 0; i < stages - 1; i++) + fetch_to_shared(i, i, i < slice_iters); + zero_accums(); + wait_for_stage(); + fetch_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + }; + start_pipes(); + + // Main loop. + while (slice_iters) { +// We unroll over both the global fetch and the register load pipeline to ensure +// all shared memory accesses are static. Note that both pipelines have even +// length meaning that the next iteration will always start at index 0. +#pragma unroll + for (int pipe = 0; pipe < stages;) { +#pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) + break; + } + a_gl_rd += a_gl_rd_delta_o * stages; + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if (group_blocks == -1 && last) { + if (s_sh_wr_pred) + cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]); + cp_async_fence(); + } + thread_block_reduce(); + if (group_blocks == -1 && last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] -= b_gl_stride; + } + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + start_pipes(); + } + } + } +} + +#else + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks with + // a separate quantization scale + > +__global__ void +Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk + const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn + int4 *__restrict__ C, // fp16 output buffer of shape mxn + const int4 + *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int *locks // extra global storage for barrier synchronization +) { + // Marlin is not implemented yet for SM < 8.0 + assert(false); + return; +} + +#endif + +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. 
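
The arithmetic behind the comment above, spelled out for the 256-thread default (the 4-scheduler figure is an Ampere-class assumption):

threads_per_block = 256                     # default for most configurations below
warps_per_block = threads_per_block // 32   # 8 warps
warp_schedulers_per_sm = 4                  # Ampere-class assumption
assert warps_per_block / warp_schedulers_per_sm == 2  # >1 warp per scheduler for latency hiding
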
+const int USER_THREADS = + 256; // Note: This is only used with user-provided thread_k/n +const int STAGES = 4; // 4 pipeline stages fit into shared memory +const int SHARED_MEM = + 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; + +static constexpr int tile_size = 16; +static constexpr int max_par = 16; + +static constexpr int pack_factor_4bit = + 8; // We have 8 4-bit vals inside a 32 bit + +#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + GROUP_BLOCKS, NUM_THREADS) \ + else if (thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute(Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + SHARED_MEM); \ + Marlin<<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, // Default + {128, 64, 128}, // Reduce N 2X, same K + {64, 256, 256}, // Reduce K 2X, increase N 2X + {64, 128, 128}, // Reduce K 2X, same N +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, // Default + {128, 128, 256}, // Reduce N 2X, increase K 2X + {64, 128, 128}, // Reduce N 2X, same K + {128, 64, 128}, // Reduce N 4X, increase K 2X +}; + +bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n, + int prob_k) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // thread_k can be only 128 or 64 (because it must be less than groupsize + // which is 128) + if (th_config.thread_k != 128 && th_config.thread_k != 64) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + return true; +} + +thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { + + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + } + + return thread_config_t{-1, -1, -1}; +} + +#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) + +void marlin_cuda(const void *A, const void *B, void *C, void 
*s, int prob_m, + int prob_n, int prob_k, void *workspace, int groupsize = -1, + int dev = 0, cudaStream_t stream = 0, int thread_k = -1, + int thread_n = -1, int sms = -1, int max_par = 16) { + int tot_m = prob_m; + int tot_m_blocks = ceildiv(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + + // Set thread config + thread_config_t th_config; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; + } else { + // Auto config + th_config = determine_thread_config(prob_m, prob_n, prob_k); + } + + if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { + throw std::runtime_error( + "Invalid thread config: thread_k = " + str(th_config.thread_k) + + ", thread_n = " + str(th_config.thread_n) + + ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + + str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); + } + + // Uncomment for debug + // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + + // ", thread_n = " + str(th_config.thread_n) + + // ", num_threads = " + str(th_config.num_threads) + " for + // MKN = [" + str(prob_m) + + // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; + + int num_threads = th_config.num_threads; + thread_k = th_config.thread_k; + thread_n = th_config.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; + int blocks = sms; + + if (prob_m == 0 || prob_n == 0 || prob_k == 0) { + return; + } + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + if (group_blocks != -1) { + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + + const int4 *A_ptr = (const int4 *)A; + const int4 *B_ptr = (const int4 *)B; + int4 *C_ptr = (int4 *)C; + const int4 *s_ptr = (const int4 *)s; + + int *locks = (int *)workspace; + + for (int i = 0; i < tot_m_blocks; i += 4) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / 64; + if (par > max_par) + par = max_par; + prob_m = 64 * par; + i += 4 * (par - 1); + thread_m_blocks = 4; + } + + // For compilation speed, we only define the kernel configurations that have + // seemed useful (in terms of performance) in our testing, however many more + // are, in principle, possible. 
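
The `if (false) {}` plus `CALL_IF` chain that follows expands, via `__CALL_IF`, into an `else if` ladder over a fixed set of template instantiations; anything outside that set falls through to the `std::runtime_error`. For reference, the compiled set can be written out explicitly (derived from the four `CALL_IF` lines; the repeated entries in the macro simply collapse in the set):

SUPPORTED_CONFIGS = {
    (thread_m_blocks, n_blocks, k_blocks, group_blocks, threads)
    for (n_blocks, k_blocks, threads) in [(8, 8, 256), (16, 4, 256),
                                          (8, 4, 128), (4, 8, 128)]
    for thread_m_blocks in (1, 2, 3, 4)
    for group_blocks in (-1, 8)          # channelwise, or groupsize 128 (128 / 16)
}

def is_compiled(m_blocks, n_blocks, k_blocks, group_blocks, threads) -> bool:
    return (m_blocks, n_blocks, k_blocks, group_blocks, threads) in SUPPORTED_CONFIGS

assert len(SUPPORTED_CONFIGS) == 4 * 4 * 2   # 32 compiled kernel variants
assert is_compiled(4, 16, 4, 8, 256)
assert not is_compiled(1, 8, 8, -1, 128)     # thread shape exists, but not with 128 threads
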
+ if (false) { + } + CALL_IF(8, 8, 256) + CALL_IF(16, 4, 256) + CALL_IF(8, 4, 128) + CALL_IF(4, 8, 128) + else { + throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + + ", " + str(prob_k) + ", " + str(prob_n) + "]" + + ", groupsize = " + str(groupsize) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace marlin + +torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, + torch::Tensor &b_scales, torch::Tensor &workspace, + int64_t size_m, int64_t size_n, int64_t size_k) { + + // Verify M + TORCH_CHECK(size_m == a.size(0), + "Shape mismatch: a.size(0) = " + str(a.size(0)) + + ", size_m = " + str(size_m)); + + // Verify K + TORCH_CHECK(size_k == a.size(1), + "Shape mismatch: a.size(1) = " + str(a.size(1)) + + ", size_k = " + str(size_k)); + TORCH_CHECK(size_k % marlin::tile_size == 0, + "size_k = " + str(size_k) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = " + + str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + + ", tile_size = " + str(marlin::tile_size)); + + // Verify N + TORCH_CHECK(b_scales.size(1) == size_n, + "b_scales.size(1) = " + str(b_scales.size(1)) + + ", size_n = " + str(size_n)); + TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0, + "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + + int actual_size_n = + (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit; + TORCH_CHECK(size_n == actual_size_n, + "size_n = " + str(size_n) + + ", actual_size_n = " + str(actual_size_n)); + + // Verify A device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + // Verify B device and strides + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + // Verify scales device and strides + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // Alloc C matrix + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize + if (b_scales.size(0) != 1) { + TORCH_CHECK(size_k % b_scales.size(0) == 0, + "size_k = " + str(size_k) + + ", is not divisible by b_scales.size(0) = " + + str(b_scales.size(0))); + } + int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); + + // Verify groupsize + TORCH_CHECK(groupsize == -1 || groupsize == 128, + "Unexpected groupsize = " + str(groupsize)); + + // Verify workspace size + TORCH_CHECK( + size_n % marlin::min_thread_n == 0, + "size_n = " + str(size_n) + + ", is not divisible by min_thread_n = " + str(marlin::min_thread_n)); + int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = " + str(workspace.numel()) + + " is below min_workspace_size = " + str(min_workspace_size)); + + int dev = a.get_device(); + marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), groupsize, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, + sms, marlin::max_par); + + return c; +} diff --git a/requirements-dev.txt b/requirements-dev.txt index 80d66530f47f0..55e102374fd73 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ types-setuptools pytest pytest-forked pytest-asyncio +pytest-rerunfailures httpx einops # required for MPT openai diff --git a/setup.py b/setup.py index 1f48be948aa84..745b5a9b2d02a 100644 --- a/setup.py +++ b/setup.py @@ -342,6 +342,8 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append( + "csrc/quantization/marlin/marlin_cuda_kernel.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") # Add MoE kernels. diff --git a/tests/conftest.py b/tests/conftest.py index 30a3df89d9f12..6eb8159837d51 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -199,6 +199,24 @@ def generate( outputs.append((req_sample_output_ids, req_sample_output_strs)) return outputs + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + ) -> List[Tuple[List[int], str]]: + assert sampling_params.logprobs is not None + + req_outputs = self.model.generate(prompts, + sampling_params=sampling_params) + outputs = [] + for req_output in req_outputs: + for sample in req_output.outputs: + output_str = sample.text + output_ids = sample.token_ids + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs)) + return outputs + def generate_greedy( self, prompts: List[str], @@ -209,6 +227,20 @@ def generate_greedy( return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + ) -> List[Tuple[List[int], str]]: + greedy_logprobs_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs) + outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params) + + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + def generate_beam_search( self, prompts: List[str], diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py new file mode 100644 index 0000000000000..f3cc517364f06 --- /dev/null +++ b/tests/models/test_marlin.py @@ -0,0 +1,97 @@ +"""Compare the outputs of a GPTQ model to a Marlin model. + +Note: GPTQ and Marlin do not have bitwise correctness. +As a result, in this test, we just confirm that the top selected tokens of the +Marlin/GPTQ models are in the top 3 selections of each other. + +Note: Marlin internally uses locks to synchronize the threads. 
This can +result in very slight nondeterminism for Marlin. As a result, we re-run the test +up to 3 times to see if we pass. + +Run `pytest tests/models/test_marlin.py --forked`. +""" + +import pytest +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +marlin_not_supported = ( + capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability()) + + +@dataclass +class ModelPair: + model_marlin: str + model_gptq: str + + +model_pairs = [ + ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", + model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), + ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", + model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), + ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", + model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") +] + + +@pytest.mark.flaky(reruns=2) +@pytest.mark.skipif(marlin_not_supported, + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("model_pair", model_pairs) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +def test_models( + vllm_runner, + example_prompts, + model_pair: ModelPair, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype) + marlin_outputs = marlin_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting the just model + # frees the memory. + del marlin_model.model.llm_engine.driver_worker + del marlin_model + + gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype) + gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting the just model + # frees the memory. + del gptq_model.model.llm_engine.driver_worker + del gptq_model + + # loop through the prompts + for prompt_idx in range(len(example_prompts)): + gptq_output_ids, gptq_output_str, gptq_logprobs = gptq_outputs[ + prompt_idx] + marlin_output_ids, marlin_output_str, marlin_logprobs = marlin_outputs[ + prompt_idx] + + for idx, (gptq_output_id, marlin_output_id) in enumerate( + zip(gptq_output_ids, marlin_output_ids)): + # If sequence is not an exact match, + if marlin_output_id != gptq_output_id: + # Each predicted token must be in top 5 of the other's + assert gptq_output_id in marlin_logprobs[idx], ( + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" + ) + assert marlin_output_id in gptq_logprobs[idx], ( + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" + ) + + # Break out since sequences will now diverge. 
+ break diff --git a/vllm/config.py b/vllm/config.py index 2f8883fe0733e..b4d48d34a8a72 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -155,15 +155,21 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] + supported_quantization = ["awq", "gptq", "squeezellm", "marlin"] + rocm_not_supported_quantization = ["awq", "marlin"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: + hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. + if (hf_quant_method == "gptq" + and "is_marlin_format" in hf_quant_config + and hf_quant_config["is_marlin_format"]): + hf_quant_method = "marlin" if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: @@ -183,9 +189,11 @@ def _verify_quantization(self) -> None: raise ValueError( f"{self.quantization} quantization is currently not supported " f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") + if self.quantization != "marlin": + logger.warning( + f"{self.quantization} quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.") def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 55d38b763b2b5..b2396a1d6f141 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -17,6 +17,14 @@ logger = init_logger(__name__) +def adjust_marlin_shard(param, shard_size, shard_offset): + marlin_tile_size = getattr(param, "marlin_tile_size", None) + if marlin_tile_size is None: + return shard_size, shard_offset + + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @@ -276,6 +284,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -293,6 +306,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -372,6 +390,7 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + if loaded_shard_id is None: # Loaded weight is already packed. 
if output_dim is None: @@ -393,6 +412,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -417,6 +441,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b3449eaff0e35..dc54641878c64 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,11 +4,13 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig +from vllm.model_executor.layers.quantization.marlin import MarlinConfig _QUANTIZATION_CONFIG_REGISTRY = { "awq": AWQConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, + "marlin": MarlinConfig, } diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py new file mode 100644 index 0000000000000..7566d78a8aba4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -0,0 +1,210 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + + +class MarlinConfig(QuantizationConfig): + """Config class for Marlin. + + Reference: https://github.com/IST-DASLab/marlin/tree/master + """ + + def __init__( + self, + group_size: int, + ) -> None: + # Group size for the quantization. + self.group_size = group_size + if self.group_size != 128 and self.group_size != -1: + raise ValueError( + "Currently, only group size 128 and -1 (channelwise) is supported for " + f"Marlin, but got group_size of {self.group_size}") + + # 4 Bits packed into 32 bit datatype. + self.pack_factor = 32 // 4 + + # Tile size used by marlin kernels. + self.tile_size = 16 + + # Min out_features dim + self.min_n_threads = 64 + + # Min in_features dim + self.min_k_threads = 128 + + # Max parallel problems to solve at once (improves large batch performance) + self.max_parallel = 16 + + # Permutation length used by the marlin kernels. 
+ self.perm_len = 1024 + + def __repr__(self) -> str: + return f"MarlinConfig(group_size={self.group_size}" + + @classmethod + def get_name(cls) -> str: + return "marlin" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig": + group_size = cls.get_from_keys(config, ["group_size"]) + return cls(group_size) + + def get_linear_method(self) -> "MarlinLinearMethod": + return MarlinLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class MarlinLinearMethod(LinearMethodBase): + """Linear method for Marlin. + + Args: + quant_config: The Marlin quantization config. + """ + + def __init__(self, quant_config: MarlinConfig): + self.quant_config = quant_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + del output_size # Unused. + + if params_dtype != torch.float16: + raise ValueError( + f"The params dtype must be float16, but got {params_dtype}") + + # Validate output_size_per_partition + if output_size_per_partition % self.quant_config.min_n_threads != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." + ) + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." + ) + + # Validate input_size_per_partition + if input_size_per_partition % self.quant_config.min_k_threads != 0: + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." + ) + if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." + ) + + # Check that we have at least 4 tiles horizontally in the shard + num_tiles_per_perm = self.quant_config.perm_len // ( + self.quant_config.tile_size**2) + if output_size_per_partition % num_tiles_per_perm != 0: + raise ValueError( + "Each permutation group must reside on the same gpu") + + # Quantized 4Bit weights packed into Int32. 
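
As a quick sanity check on the shapes allocated below (illustrative sizes, not taken from any particular model): a [K, N] fp16 layer stored as 4-bit Marlin weights with tile_size 16 and pack_factor 8 gives

K, N = 4096, 4096          # input_size_per_partition, output_size_per_partition
tile, pack, group = 16, 8, 128

qweight_shape = (K // tile, N * tile // pack)   # int32 tensor, as allocated below
scales_shape = (K // group, N)                  # fp16; a single row when channelwise (group_size == -1)

assert qweight_shape == (256, 8192)
assert scales_shape == (32, 4096)
# The packing is lossless in element count: K*N 4-bit values, 8 per int32 word.
assert qweight_shape[0] * qweight_shape[1] == K * N // pack
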
+ qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.tile_size, + output_size_per_partition * self.quant_config.tile_size // + self.quant_config.pack_factor, + device="cuda", + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + "marlin_tile_size": self.quant_config.tile_size, + }, + ) + + # Determine if channelwise or not + input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + device="cuda", + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "input_dim": None if input_groups == 1 else 0, + "output_dim": 1, + }, + ) + + # Allocate workspace (Used for internal locking mechanism) + max_workspace_size = ( + output_size_per_partition // + self.quant_config.min_n_threads) * self.quant_config.max_parallel + workspace = Parameter(torch.zeros(max_workspace_size, + device="cuda", + dtype=torch.int), + requires_grad=False) + + return { + "B": qweight, + "s": scales, + "workspace": workspace, + } + + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qweight = weights["B"] + scales = weights["s"] + workspace = weights["workspace"] + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + + if bias is not None: + output.add_(bias) # In-place add + + return output From 82091b864af105dbe373353655dc9d8c0a6ba66f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 1 Mar 2024 12:58:06 -0800 Subject: [PATCH 041/113] Bump up to v0.3.3 (#3129) --- vllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 7ff92d8cc681d..f1e30f5eb6e6e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.3.2" +__version__ = "0.3.3" __all__ = [ "LLM", From 29e70e3e88698feca9509cf07fcf06b12163f1c3 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Sat, 2 Mar 2024 07:28:41 +0800 Subject: [PATCH 042/113] allow user chose log level by --log-level instead of fixed 'info'. 
(#3109) Co-authored-by: zixiao Co-authored-by: Simon Mo --- vllm/config.py | 2 +- vllm/engine/metrics.py | 1 + vllm/entrypoints/openai/api_server.py | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b4d48d34a8a72..e260e6a0cb1d6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -317,7 +317,7 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value:str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 54b09c38f58a5..d31542159e4a4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -23,6 +23,7 @@ def __init__(self, labelnames: List[str]): if hasattr(collector, "_name") and "vllm" in collector._name: REGISTRY.unregister(collector) + # Config Information self.info_cache_config = Info( name='vllm:cache_config', documentation='information of cache_config') diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b2f040114a078..3777e0f3a0601 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -62,6 +62,12 @@ def parse_args(): description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=str, default=None, help="host name") parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], + help="log level for uvicorn") parser.add_argument("--allow-credentials", action="store_true", help="allow credentials") @@ -245,7 +251,7 @@ async def authentication(request: Request, call_next): uvicorn.run(app, host=args.host, port=args.port, - log_level="info", + log_level=args.uvicorn_log_level, timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile) From baee28c46c242b72f90d6b1211ab9d7872ab05d3 Mon Sep 17 00:00:00 2001 From: cloudhan Date: Sat, 2 Mar 2024 14:34:48 +0800 Subject: [PATCH 043/113] Reorder kv dtype check to avoid nvcc not found error on AMD platform (#3104) --- vllm/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e260e6a0cb1d6..ff8536c1aca55 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -330,15 +330,14 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8_e5m2": + if is_hip(): + raise NotImplementedError( + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"): raise ValueError( "FP8 is not supported when cuda version is lower than 11.8." ) - device_name = torch.cuda.get_device_name() - if "AMD" in device_name: - raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. 
" From ce4f5a29fb3e35041842518fefe999847b8326b9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Sat, 2 Mar 2024 03:50:01 -0500 Subject: [PATCH 044/113] Add Automatic Prefix Caching (#2762) Co-authored-by: ElizaWszola Co-authored-by: Michael Goin --- benchmarks/benchmark_throughput.py | 30 ++- docs/source/models/engine_args.rst | 4 + examples/offline_inference_with_prefix.py | 11 +- tests/prefix_caching/test_prefix_caching.py | 103 ++++--- tests/test_cache_block_hashing.py | 76 ++++++ vllm/block.py | 14 +- vllm/config.py | 2 + vllm/core/block_manager.py | 285 +++++++++++++++----- vllm/core/evictor.py | 161 +++++++++++ vllm/core/scheduler.py | 15 +- vllm/engine/arg_utils.py | 9 +- vllm/engine/async_llm_engine.py | 14 +- vllm/engine/llm_engine.py | 26 +- vllm/entrypoints/api_server.py | 6 +- vllm/entrypoints/llm.py | 14 +- vllm/prefix.py | 87 ------ vllm/sequence.py | 23 +- vllm/worker/model_runner.py | 30 ++- 18 files changed, 618 insertions(+), 292 deletions(-) create mode 100644 tests/test_cache_block_hashing.py create mode 100644 vllm/core/evictor.py delete mode 100644 vllm/prefix.py diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1ad502526c97c..51c1a6540a451 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,21 +73,21 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, + enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - device=device, - ) + llm = LLM(model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + device=device, + enable_prefix_caching=enable_prefix_caching) # Add the requests to the engine. for prompt, _, output_len in requests: @@ -211,7 +211,8 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device) + args.kv_cache_dtype, args.device, + args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -302,6 +303,7 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument("--enable_prefix_caching", action='store_true') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index d89b795149501..9f5f672ae4f34 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks of tokens. +.. option:: --enable-prefix-caching + + Enables automatic prefix caching + .. option:: --seed Random seed for operations. 
diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 8ccfb1ceea731..1aa718b88907c 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -37,20 +37,13 @@ print("-" * 80) -# -1 since the last token can change when concatenating prompts. -prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - # The llm.generate call will batch all prompts and send the batch at once if resources allow. # The prefix will only be cached after the first batch is processed, so we need to call generate once # to calculate the prefix and cache it. -outputs = llm.generate(generating_prompts[0], - sampling_params, - prefix_pos=[prefix_pos]) +outputs = llm.generate(generating_prompts[0], sampling_params) # Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(generating_prompts)) +outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. You should see the same outputs as before for output in outputs: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1e301bedfc21e..7ef8dde7bb8f6 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,38 +4,73 @@ """ import pytest -from vllm import LLM, SamplingParams - -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_prefix_caching( - example_prompts, - model: str, - max_tokens: int, +from vllm.core.block_manager import BlockAllocator +from vllm.utils import Device + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_blocks", [16]) +def test_block_allocator( + block_size: int, + num_blocks: int, ): - llm = LLM(model=model) - # -1 since the last token can change when concatenating prompts. 
- prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - prompts = [prefix + prompt for prompt in example_prompts] - sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(prompts)) - for output_without_prefix, output_with_prefix in zip( - outputs_without_prefix, outputs_with_prefix): - assert (output_without_prefix.outputs[0].token_ids == - output_with_prefix.outputs[0].token_ids) - assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 + block_hash = 1 + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) + + # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + first_block = block_allocator.allocate(block_hash, 0) + second_block = block_allocator.allocate(block_hash, 0) + assert (first_block == second_block) + assert (second_block.ref_count == 2) + + # Free the first_block and confirm that the ref_count is correctly decremented on the second block + block_allocator.free(first_block) + assert (second_block.ref_count == 1) + + # Free the second block + block_allocator.free(second_block) + + # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + first_block = block_allocator.allocate(block_hash, 0) + assert (first_block == second_block) + assert (first_block.block_hash == block_hash) + + +@pytest.mark.parametrize("num_blocks", [16]) +def test_eviction(num_blocks: int, ): + block_size = 16 + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) + blocks = [] + + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i, 0)) + + #Free all blocks + for block in blocks: + block_allocator.free(block) + + # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + new_block_hash = block_size + new_block = block_allocator.allocate(new_block_hash, 0) + assert (new_block == blocks[0]) + assert (new_block.block_hash == new_block_hash) + + # Reallocate the second in blocks to remove it from the free list + realloc_block_hash = 1 + realloc_block = block_allocator.allocate(realloc_block_hash, 0) + assert (realloc_block == blocks[realloc_block_hash]) + assert (realloc_block.block_hash == realloc_block_hash) + + # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + new_block_hash = block_size + 1 + new_block = block_allocator.allocate(new_block_hash, 0) + assert (realloc_block != new_block) + assert (new_block.block_hash == new_block_hash) + assert (new_block.block_number == 2) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py new file mode 100644 index 0000000000000..7c4ade7f8c8ed --- /dev/null +++ b/tests/test_cache_block_hashing.py @@ -0,0 +1,76 @@ +"""Test hashing of cache blocks. + +Run `pytest tests/test_cache_block_hashing.py`. +""" +import pytest + +from vllm.transformers_utils.tokenizer import TokenizerGroup +from vllm.sequence import Sequence + +# Make two prefixes with different first blocks. +prefix_start = [("You are an expert"), ("You are a")] +prefix_common = ( + " school principal, skilled in effectively managing " + "faculty and staff. 
Draft 10-15 questions for a potential first grade " + "Head Teacher for my K-12, all-girls', independent school that emphasizes " + "community, joyful discovery, and life-long learning. The candidate is " + "coming in for a first-round panel interview for a 8th grade Math " + "teaching role. They have 5 years of previous teaching experience " + "as an assistant teacher at a co-ed, public school with experience " + "in middle school math teaching. Based on this, fulfill " + "the following: ") +prefixes = [start + prefix_common for start in prefix_start] + +# Sample prompts. +sample_prompts = [ + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" +] + + +# Helper function. +def flatten_2d(li): + return [lss for ls in li for lss in ls] + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("max_num_seqs", [256]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): + + tokenizer = TokenizerGroup( + tokenizer_id="facebook/opt-125m", + enable_lora=False, + max_num_seqs=max_num_seqs, + max_input_length=None, + ) + + hashes = [] + + for prefix in prefixes: + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 + + # Check that hashes made with two prefixes with different first blocks are + # different everywhere. + for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): + assert (hash0 != hash1) + + # Check that hashes of different prompts made with the same prefix are the + # same until the hashes that contain the prompt. + for hash_pref in hashes: + same_hashes = [tuple(h[:-1]) for h in hash_pref] + different_hashes = [h[-1] for h in hash_pref] + assert (len(set(same_hashes)) == 1) + assert (len(set(different_hashes)) == len(different_hashes)) diff --git a/vllm/block.py b/vllm/block.py index 5fe39ed47b2ff..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -5,6 +5,8 @@ _BLANK_TOKEN_ID = -1 +DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -55,17 +57,27 @@ def __init__( device: Device, block_number: int, block_size: int, + block_hash: int, + num_hashed_tokens: int, ) -> None: self.device = device self.block_number = block_number self.block_size = block_size + self.block_hash = block_hash + self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 + self.last_accessed = DEFAULT_LAST_ACCESSED_TIME + + self.computed = False def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'ref_count={self.ref_count})') + f'num_hashed_tokens={self.num_hashed_tokens}, ' + f'ref_count={self.ref_count}, ' + f'last_accessed={self.last_accessed}, ' + f'computed={self.computed})') # Mapping: logical block number -> physical block. 
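
The new `block_hash` and `num_hashed_tokens` fields are filled from `Sequence.hash_of_block`, which conceptually hashes every prompt token up to and including a given logical block, so sequences that share a prefix share hashes for the shared blocks and diverge afterwards. The sketch below models that behaviour; it is not the exact implementation:

def hash_of_block(token_ids, block_size, logical_idx):
    # The hash covers every token up to and including this logical block.
    num_hashed = (logical_idx + 1) * block_size
    return hash(tuple(token_ids[:num_hashed]))

a = list(range(40))                 # two prompts that share their first 32 tokens
b = list(range(32)) + [99] * 8
assert hash_of_block(a, 16, 0) == hash_of_block(b, 16, 0)
assert hash_of_block(a, 16, 1) == hash_of_block(b, 16, 1)
assert hash_of_block(a, 16, 2) != hash_of_block(b, 16, 2)
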
diff --git a/vllm/config.py b/vllm/config.py index ff8536c1aca55..876a439cd1280 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -303,12 +303,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, + enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.enable_prefix_caching = enable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 3946096d4296a..08d519ab767a9 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,10 +1,13 @@ """A block manager that manages token blocks.""" import enum +from itertools import count +from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor class BlockAllocator: @@ -15,29 +18,68 @@ class BlockAllocator: the reference count becomes zero, the block is added back to the free list. """ - def __init__( - self, - device: Device, - block_size: int, - num_blocks: int, - ) -> None: + def __init__(self, + device: Device, + block_size: int, + num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU, + enable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks + self.enable_caching = enable_caching + + self.current_num_blocks = 0 + self.cached_blocks: Dict[int, PhysicalTokenBlock] = {} + + # Switch over to FIFO eviction when caching is disabled + if not self.enable_caching: + eviction_policy = EvictionPolicy.FIFO + self.evictor: Evictor = make_evictor(eviction_policy) + + self.default_hash_ctr = count() + + def allocate_block(self, block_hash: int, + num_hashed_tokens: int) -> PhysicalTokenBlock: + if self.current_num_blocks == self.num_blocks: + block = self.evictor.evict() + block.block_hash = block_hash + block.num_hashed_tokens = num_hashed_tokens + return block + block = PhysicalTokenBlock(device=self.device, + block_number=self.current_num_blocks, + block_size=self.block_size, + block_hash=block_hash, + num_hashed_tokens=num_hashed_tokens) + self.current_num_blocks += 1 + return block - # Initialize the free blocks. - self.free_blocks: BlockTable = [] - for i in range(num_blocks): - block = PhysicalTokenBlock(device=device, - block_number=i, - block_size=block_size) - self.free_blocks.append(block) - - def allocate(self) -> PhysicalTokenBlock: - if not self.free_blocks: - raise ValueError("Out of memory! 
No free blocks are available.") - block = self.free_blocks.pop() - block.ref_count = 1 + def allocate(self, + block_hash: Optional[int] = None, + num_hashed_tokens: int = 0) -> PhysicalTokenBlock: + # If caching is disabled, just allocate a new block and return it + if not self.enable_caching: + block = self.allocate_block(next(self.default_hash_ctr), + num_hashed_tokens) + block.ref_count += 1 + return block + + if block_hash is None: + block_hash = next(self.default_hash_ctr) + if block_hash in self.evictor: + assert block_hash not in self.cached_blocks + block = self.evictor.remove(block_hash) + assert block.ref_count == 0 + self.cached_blocks[block_hash] = block + block.ref_count += 1 + assert block.block_hash == block_hash + return block + if block_hash not in self.cached_blocks: + self.cached_blocks[block_hash] = self.allocate_block( + block_hash, num_hashed_tokens) + block = self.cached_blocks[block_hash] + assert block.block_hash == block_hash + block.ref_count += 1 return block def free(self, block: PhysicalTokenBlock) -> None: @@ -45,10 +87,27 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - self.free_blocks.append(block) + assert block.block_hash not in self.evictor + self.evictor.add(block) + + # If caching is enabled, remove the block from the cached_blocks + if self.enable_caching: + del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return len(self.free_blocks) + return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + + def contains_block(self, block_hash: int) -> bool: + return block_hash in self.cached_blocks or block_hash in self.evictor + + def update_hash(self, block_hash: int, block: PhysicalTokenBlock): + # If caching is enabled, update the hash of block and the cached_blocks dictionary. + if self.enable_caching: + assert not self.contains_block(block_hash) + old_hash = block.block_hash + block.block_hash = block_hash + del self.cached_blocks[old_hash] + self.cached_blocks[block_hash] = block class AllocStatus(enum.Enum): @@ -75,6 +134,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, + enable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -89,11 +149,17 @@ def __init__( self.watermark = watermark assert watermark >= 0.0 + self.enable_caching = enable_caching + self.watermark_blocks = int(watermark * num_gpu_blocks) - self.gpu_allocator = BlockAllocator(Device.GPU, block_size, - num_gpu_blocks) - self.cpu_allocator = BlockAllocator(Device.CPU, block_size, - num_cpu_blocks) + self.gpu_allocator = BlockAllocator(Device.GPU, + block_size, + num_gpu_blocks, + enable_caching=enable_caching) + self.cpu_allocator = BlockAllocator(Device.CPU, + block_size, + num_cpu_blocks, + enable_caching=enable_caching) # Mapping: seq_id -> BlockTable. 
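# --- Editor's sketch (not part of the patch): the allocate/free contract of the
# caching BlockAllocator above, reduced to a ref-counted, hash-keyed table. The
# fixed pool size and the evictor of the real allocator are deliberately ignored;
# the class name is hypothetical.
class _TinyCachingAllocator:

    def __init__(self):
        self._cached = {}  # block_hash -> mutable [ref_count]

    def allocate(self, block_hash):
        entry = self._cached.setdefault(block_hash, [0])
        entry[0] += 1      # a cache hit returns the same entry and bumps its refs
        return entry

    def free(self, entry):
        if entry[0] == 0:
            raise ValueError("Double free!")
        entry[0] -= 1      # at ref_count == 0 the real allocator hands it to the evictor

_alloc = _TinyCachingAllocator()
_b1 = _alloc.allocate(block_hash=123)
_b2 = _alloc.allocate(block_hash=123)
assert _b1 is _b2 and _b1[0] == 2      # equal prefix hash -> one shared physical block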
self.block_tables: Dict[int, BlockTable] = {} @@ -103,9 +169,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = len(seq.logical_token_blocks) - if seq_group.prefix is not None and seq_group.prefix.allocated: - num_required_blocks -= seq_group.prefix.get_num_blocks() - if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -129,36 +192,16 @@ def allocate(self, seq_group: SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - prefix_block_table: BlockTable = [] - num_prefix_blocks = 0 - - prefix = seq_group.prefix - if prefix is not None and prefix.allocated: - # Prefix has already been allocated. Use the existing block table. - num_prompt_blocks -= prefix.get_num_blocks() - for block in prefix.block_table: - block.ref_count += seq_group.num_seqs() - block_table.append(block) - for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate() - # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block = self.gpu_allocator.allocate( + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) block_table.append(block) - if prefix is not None and not prefix.allocated: - # Allocate blocks for the prefix, we will compute the prefix's - # KV cache in this run. - num_prefix_blocks = prefix.get_num_blocks() - prefix_block_table = block_table[:num_prefix_blocks] - for block in prefix_block_table: - block.ref_count += 1 - prefix.set_block_table(prefix_block_table) - # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -170,12 +213,72 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: + def _promote_last_block( + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: + # Compute a new hash for the block so that it can be shared by other Sequences + new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + + # if new_hash is already in the cached table, then free last_block and return the cached version + if self.gpu_allocator.contains_block(new_hash): + self.gpu_allocator.free(last_block) + return self.gpu_allocator.allocate(new_hash) + else: + self.gpu_allocator.update_hash(new_hash, last_block) + return last_block + + def _is_last_block_full( + self, + seq: Sequence, + ) -> bool: + token_ids_len = len(seq.data.get_token_ids()) + return token_ids_len > 0 and token_ids_len % seq.block_size == 0 + + def _is_last_block( + self, + seq: Sequence, + index: int, + ) -> bool: + return index == len(seq.logical_token_blocks) - 1 + + def _maybe_promote_last_block( + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: + if self._is_last_block_full(seq): + return self._promote_last_block(seq, last_block) + else: + return last_block + + def _allocate_last_physical_block( + self, + seq: Sequence, + ) -> PhysicalTokenBlock: + block_hash: Optional[int] = None + if (self._is_last_block_full(seq)): + block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + num_hashed_tokens = seq.num_hashed_tokens_of_block( + len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) + if block_hash is None: + assert new_block.ref_count == 1 + return new_block + + def append_slot( + self, + seq: Sequence, + ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] - + # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): + # Currently this code only supports adding one physical block + assert len(block_table) == len(logical_blocks) - 1 + if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): # reuse a block @@ -184,8 +287,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. - block = self.gpu_allocator.allocate() - block_table.append(block) + new_block = self._allocate_last_physical_block(seq) + block_table.append(new_block) return None # We want to append the token to the last physical block. @@ -193,11 +296,15 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. + # If the last block is now complete, promote it to a full block so that it can be shared + new_block = self._maybe_promote_last_block(seq, last_block) + block_table[-1] = new_block return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
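# --- Editor's sketch (not part of the patch): when the promotion above can happen.
# A block is only given a content hash and made shareable once it is exactly full;
# a partially filled last block stays private to its sequence. The block size is
# illustrative.
_BLOCK_SIZE = 16

def _is_last_block_full(num_tokens: int) -> bool:
    return num_tokens > 0 and num_tokens % _BLOCK_SIZE == 0

assert not _is_last_block_full(15)   # still filling -> keep the private block
assert _is_last_block_full(16)       # exactly full -> hash it and promote for sharing
assert not _is_last_block_full(17)   # a new partial last block has started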
- new_block = self.gpu_allocator.allocate() + new_block = self._allocate_last_physical_block(seq) + block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number @@ -233,25 +340,18 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. - if seq_group.prefix is not None: - # make sure to swap in the prefix first - assert seq_group.prefix.allocated and seq_group.prefix.computed - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - if seq_group.prefix is not None: - for block in seq_group.prefix.block_table: - new_block_table.append(block) - block.ref_count += 1 for cpu_block in block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate() + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -276,17 +376,12 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: block_table = self.block_tables[seq.seq_id] for gpu_block in block_table: - if (seq_group.prefix is not None - and gpu_block in seq_group.prefix.block_table): - # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block) - continue - if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate() + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. @@ -328,3 +423,49 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() + + def access_all_blocks_in_seq( + self, + seq: Sequence, + access_time: float, + ) -> None: + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.last_accessed = access_time + + def compute_last_full_block_in_seq(self, seq: Sequence): + if seq.seq_id not in self.block_tables: + return + max_full_block = seq.get_len() // seq.block_size - 1 + block_table = self.block_tables[seq.seq_id] + if max_full_block == -1: + return + block_table[max_full_block].computed = True + + def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + if seq.seq_id not in self.block_tables: + return [] + block_table = self.block_tables[seq.seq_id] + for block_idx in reversed(range(len(block_table))): + if block_table[block_idx].computed: + return [b.block_number for b in block_table[:block_idx + 1]] + return [] + + # Can return non-empty result only with prefix caching enabled. + def get_common_computed_block_ids(self, + seq_group: SequenceGroup) -> List[int]: + if not self.enable_caching: + return [] + + ids_list = [ + self.get_all_block_ids_till_computed(seq) + for seq in iter(seq_group.seqs_dict.values()) + ] + return commonprefix([ids for ids in ids_list if ids != []]) + + # We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. 
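# --- Editor's sketch (not part of the patch): get_common_computed_block_ids above
# in miniature. A sequence group may only skip blocks that are computed for every
# one of its sequences, hence the commonprefix over the per-sequence lists of
# computed block ids (the ids below are illustrative).
from os.path import commonprefix

_ids_list = [[3, 7, 9], [3, 7], [3, 7, 11]]
assert commonprefix([ids for ids in _ids_list if ids != []]) == [3, 7]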
+ def mark_blocks_as_computed(self, seq_group: SequenceGroup): + if self.enable_caching: + for seq in seq_group.seqs_dict.values(): + self.compute_last_full_block_in_seq(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py new file mode 100644 index 0000000000000..b538ea574b604 --- /dev/null +++ b/vllm/core/evictor.py @@ -0,0 +1,161 @@ +import enum +from typing import Dict, List, Optional +from abc import ABC, abstractmethod, abstractproperty + +from vllm.block import PhysicalTokenBlock + + +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by make_evictor to instantiate the correct + Evictor subclass. + """ + LRU = enum.auto() + FIFO = enum.auto() + + +class Evictor(ABC): + """The Evictor subclasses should be used by the BlockAllocator class to + handle eviction of freed PhysicalTokenBlocks. + """ + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def __contains__(self, block_hash: int) -> bool: + pass + + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + """Runs the eviction algorithm and returns the evicted block""" + pass + + @abstractmethod + def add(self, block: PhysicalTokenBlock): + """Adds block to the evictor, making it a candidate for eviction""" + pass + + @abstractmethod + def remove(self, block_hash: int) -> PhysicalTokenBlock: + """Simply removes the block with the hash value block_hash from the + evictor. Caller is responsible for making sure that block_hash is contained + in the evictor before calling remove. Should be used to "bring back" blocks + that have been freed but not evicted yet. + """ + pass + + @abstractproperty + def num_blocks(self) -> int: + pass + + +class LRUEvictor(Evictor): + """Evicts in a least-recently-used order using the last_accessed timestamp + that's recorded in the PhysicalTokenBlock. If there are multiple blocks with + the same last_accessed time, then the one with the largest num_hashed_tokens + will be evicted. If two blocks each have the lowest last_accessed time and + highest num_hashed_tokens value, then one will be chose arbitrarily + """ + + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + # TODO: The performance of this evict function can be optimized further. 
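# --- Editor's sketch (not part of the patch): the selection rule that evict() below
# implements, stated as a single ordering. The block records are illustrative dicts,
# not PhysicalTokenBlock objects.
_free_blocks = [
    {"hash": 1, "last_accessed": 10.0, "num_hashed_tokens": 32},
    {"hash": 2, "last_accessed": 5.0, "num_hashed_tokens": 16},
    {"hash": 3, "last_accessed": 5.0, "num_hashed_tokens": 48},
]
_victim = max(_free_blocks,
              key=lambda b: (-b["last_accessed"], b["num_hashed_tokens"]))
assert _victim["hash"] == 3   # oldest access wins; ties go to the longest hashed prefix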
+ def evict(self) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") + + # Find lowest timestamp + lowest_timestamp = free_blocks[0].last_accessed + for block in free_blocks: + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + least_recent: List[PhysicalTokenBlock] = [] + for block in free_blocks: + if block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_num_hashed_tokens = 0 + for block in least_recent: + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens + + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp + for block in least_recent: + if block.num_hashed_tokens == highest_num_hashed_tokens: + evicted_block = block + break + + assert evicted_block is not None + + del self.free_table[evicted_block.block_hash] + + evicted_block.computed = False + return evicted_block + + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +class RandomEvictor(Evictor): + """Evicts in a first-in-first-out order""" + + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + def evict(self) -> PhysicalTokenBlock: + if len(self.free_table) == 0: + raise ValueError("No usable cache memory left") + evicted_block = next(iter(self.free_table.values())) + evicted_block.computed = False + del self.free_table[evicted_block.block_hash] + return evicted_block + + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: + if eviction_policy == EvictionPolicy.LRU: + return LRUEvictor() + elif eviction_policy == EvictionPolicy.FIFO: + return RandomEvictor() + else: + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5e7cc3091d775..1ae58f525b0fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -10,7 +10,6 @@ from vllm.logger import init_logger from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceStatus) -from vllm.prefix import PrefixPool logger = init_logger(__name__) @@ -95,10 +94,8 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, - sliding_window=self.cache_config.sliding_window) - - # Create the prefix pool to cache the prefixes. 
- self.prefix_pool = PrefixPool(self.cache_config.block_size) + sliding_window=self.cache_config.sliding_window, + enable_caching=self.cache_config.enable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() @@ -374,10 +371,12 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data block_tables[seq_id] = self.block_manager.get_block_table(seq) + self.block_manager.access_all_blocks_in_seq(seq, now) seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, @@ -386,7 +385,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix=seq_group.prefix, + computed_block_nums=self.block_manager. + get_common_computed_block_ids(seq_group), state=seq_group.state, ) seq_group_metadata_list.append(seq_group_metadata) @@ -496,3 +496,6 @@ def _swap_out( blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + self.block_manager.mark_blocks_as_computed(seq_group) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c01e7311fb89a..0349c3a6636c7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,6 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 + enable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -173,6 +174,11 @@ def add_cli_args( default=EngineArgs.block_size, choices=[8, 16, 32, 128], help='token block size') + + parser.add_argument('--enable-prefix-caching', + action='store_true', + help='Enables automatic prefix caching') + parser.add_argument('--seed', type=int, default=EngineArgs.seed, @@ -293,7 +299,8 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) + model_config.get_sliding_window(), + self.enable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index daa6419cdad3b..9e52d20ca4980 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -225,7 +225,6 @@ async def add_request_async( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -245,7 +244,6 @@ async def add_request_async( sampling_params=sampling_params, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async def _run_workers_async( @@ -422,7 +420,6 @@ async def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncStream: if self.log_requests: shortened_prompt = 
prompt @@ -435,7 +432,6 @@ async def add_request( max_log_len] logger.info(f"Received request {request_id}: " f"prompt: {shortened_prompt!r}, " - f"prefix_pos: {prefix_pos}," f"sampling_params: {sampling_params}, " f"prompt_token_ids: {shortened_token_ids}, " f"lora_request: {lora_request}.") @@ -472,8 +468,7 @@ async def add_request( sampling_params=sampling_params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) return stream @@ -484,7 +479,6 @@ async def generate( request_id: str, prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -500,11 +494,6 @@ async def generate( prompt_token_ids: The token IDs of the prompt. If None, we use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Yields: The output `RequestOutput` objects from the LLMEngine for the @@ -565,7 +554,6 @@ async def generate( prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async for request_output in stream: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index df4858a696530..e84fda5640e4d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -415,7 +415,6 @@ def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: """Add a request to the engine's request pool. @@ -432,11 +431,6 @@ def add_request( use the tokenizer to convert the prompts to token IDs. arrival_time: The arrival time of the request. If None, we use the current monotonic time. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Details: - Set arrival_time to the current time if it is None. @@ -479,18 +473,13 @@ def add_request( seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request) - # Check whether the input specifies prefix - prefix = self.scheduler.prefix_pool.add_or_get_prefix( - prompt_token_ids[:prefix_pos], lora_request.lora_int_id - if lora_request else 0) if prefix_pos is not None else None - # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects sampling_params = sampling_params.clone() # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) @@ -752,6 +741,13 @@ def _process_model_outputs( now = time.time() # Update the scheduled sequence groups with the model outputs. 
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + # If prefix caching is enabled, mark all blocks in the sequence groups + # as completed so that future requests don't attempt to recompute them + if self.cache_config.enable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) @@ -768,12 +764,6 @@ def _process_model_outputs( request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - # Update prefix state, now all the uncomputed prefixes are computed. - for seq_group in scheduled_seq_groups: - if (seq_group.prefix is not None and seq_group.prefix.allocated - and not seq_group.prefix.computed): - seq_group.prefix.computed = True - # Log stats. if self.log_stats: self.stat_logger.log(self._get_stats(scheduler_outputs)) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index e7af2c6db5e4c..1eb4ab8b06b64 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -39,15 +39,11 @@ async def generate(request: Request) -> Response: """ request_dict = await request.json() prompt = request_dict.pop("prompt") - prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() - results_generator = engine.generate(prompt, - sampling_params, - request_id, - prefix_pos=prefix_pos) + results_generator = engine.generate(prompt, sampling_params, request_id) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc82018d18eb6..62f1d172377f6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -124,7 +124,6 @@ def generate( prompts: Optional[Union[str, List[str]]] = None, sampling_params: Optional[SamplingParams] = None, prompt_token_ids: Optional[List[List[int]]] = None, - prefix_pos: Optional[Union[int, List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, ) -> List[RequestOutput]: @@ -140,11 +139,6 @@ def generate( None, we use the default sampling parameters. prompt_token_ids: A list of token IDs for the prompts. If None, we use the tokenizer to convert the prompts to token IDs. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. 
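# --- Editor's sketch (not part of the patch): what dropping prefix_pos means for
# callers. Prefix reuse is now automatic once the engine is built with
# enable_prefix_caching=True; the model name and prompts below are illustrative.
from vllm import LLM, SamplingParams

_llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
_params = SamplingParams(temperature=0.0, max_tokens=16)
_shared = "You are a helpful assistant. Answer briefly. "
_outputs = _llm.generate(
    [_shared + "What is a KV cache?", _shared + "What is paging?"],
    sampling_params=_params)   # the shared prefix blocks are computed once and reused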
@@ -171,14 +165,12 @@ def generate( prompt_token_ids) for i in range(num_requests): prompt = prompts[i] if prompts is not None else None - prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None token_ids = None if prompt_token_ids is None else prompt_token_ids[ i] self._add_request(prompt, sampling_params, token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos_i) + lora_request=lora_request) return self._run_engine(use_tqdm) def _add_request( @@ -187,15 +179,13 @@ def _add_request( sampling_params: SamplingParams, prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. diff --git a/vllm/prefix.py b/vllm/prefix.py deleted file mode 100644 index 5b6e8e4b92be6..0000000000000 --- a/vllm/prefix.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Dict, List, Sequence, Tuple, Optional - -from vllm.block import BlockTable - - -class Prefix: - """Data and states associated with a prefix of prompt tokens for multiple - sequence groups. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - token_ids: The token ids of the prefix. - block_size: The block size of the executed model. - """ - - def __init__( - self, - token_ids: Sequence[int], - block_size: int, - ) -> None: - self.token_ids = tuple(token_ids) - self.block_size = block_size - self.length = len(token_ids) - self.hash = hash(token_ids) - assert self.length % block_size == 0 - self.block_table: Optional[BlockTable] = None - self.computed = False - - @property - def allocated(self) -> bool: - return self.block_table is not None - - def get_num_blocks(self) -> int: - return self.length // self.block_size - - def get_block_numbers(self) -> List[int]: - return [block.block_number for block in self.block_table] - - def get_length(self) -> int: - return self.length - - def __hash__(self) -> int: - return self.hash - - def set_block_table(self, block_table: BlockTable) -> None: - self.block_table = block_table.copy() - - -class PrefixPool: - """Manages all the prompt prefixes. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - block_size: The block size of the executed model. - - Attributes: - prefixes: A list of all the prefixes. - block_size: The block size of the executed model. - """ - - def __init__( - self, - block_size: int, - ) -> None: - # TODO(zhuohan): Add a capacity limit to the prefix pool. - self.prefixes: Dict[int, Prefix] = {} - self.block_size = block_size - - def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: - new_length = len(token_ids) // self.block_size * self.block_size - return tuple(token_ids[:new_length]) - - def add_or_get_prefix(self, token_ids: Sequence[int], - lora_int_id: int) -> Optional[Prefix]: - token_ids = self._truncate_token_ids(token_ids) - if len(token_ids) == 0: - # Prefix is empty. 
- return None - prefix = Prefix(token_ids, self.block_size) - prefix_hash = hash((prefix, lora_int_id)) - if prefix_hash not in self.prefixes: - self.prefixes[prefix_hash] = prefix - return self.prefixes[prefix_hash] diff --git a/vllm/sequence.py b/vllm/sequence.py index 040e9756e15c6..122960035e505 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,7 +5,6 @@ from typing import Dict, List, Optional, Union from vllm.block import LogicalTokenBlock -from vllm.prefix import Prefix from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest @@ -161,6 +160,16 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + # TODO The current hashing function is O(L^2). We should optimize this in + # the future. + def hash_of_block(self, logical_idx: int) -> int: + # Compute the number of tokens in the sequence + num_tokens = self.num_hashed_tokens_of_block(logical_idx) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + + def num_hashed_tokens_of_block(self, logical_idx: int): + return logical_idx * self.block_size + self.block_size + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), @@ -265,7 +274,6 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. """ def __init__( @@ -275,7 +283,6 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -286,7 +293,6 @@ def __init__( first_token_time=None, time_in_queue=None) self.lora_request = lora_request - self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -302,6 +308,10 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. return next(iter(self.seqs_dict.values())).data.prompt_token_ids + @property + def block_size(self) -> int: + return next(iter(self.seqs_dict.values())).block_size + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 @@ -408,7 +418,6 @@ class SequenceGroupMetadata: numbers) state: Internal state tied to this sequence group. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. 
""" def __init__( @@ -419,7 +428,7 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, + computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, ) -> None: self.request_id = request_id @@ -428,7 +437,7 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix = prefix + self.computed_block_nums = computed_block_nums self.state = SequenceGroupState() if state is None else state @property diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index efe570778fb43..aff8ebc903623 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -145,33 +145,37 @@ def _prepare_prompt( prompt_tokens = seq_data.get_token_ids() prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) - prefix_len = 0 - prefix = seq_group_metadata.prefix - if prefix is not None and prefix.computed: - prefix_len = prefix.get_length() - prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix.get_block_numbers()) + computed_len = 0 + + # NOTE: This only works for oooooooxxx style attention. + computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] + prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens - context_lens.append(prefix_len) - subquery_lens.append(prompt_len - prefix_len) + context_lens.append(computed_len) + subquery_lens.append(prompt_len - computed_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append( - list(range(prefix_len, prefix_len + len(prompt_tokens)))) + list(range(computed_len, computed_len + len(prompt_tokens)))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_index_mapping.append([lora_id] * (prompt_len - computed_len)) lora_prompt_mapping.extend( [lora_id] * - (prompt_len - prefix_len + (prompt_len - computed_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.block_tables is None: @@ -190,11 +194,11 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
start_idx = 0 if self.sliding_window is not None: - assert prefix_len == 0, ( + assert computed_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) - for i in range(prefix_len, prompt_len): + for i in range(computed_len, prompt_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue From d65fac2738f0287a41955b45df76a2d5a919bff6 Mon Sep 17 00:00:00 2001 From: Jason Cox Date: Sun, 3 Mar 2024 00:00:29 -0500 Subject: [PATCH 045/113] Add vLLM version info to logs and openai API server (#3161) --- vllm/engine/llm_engine.py | 3 ++- vllm/entrypoints/openai/api_server.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e84fda5640e4d..c9bd89a1b18f4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) @@ -85,7 +86,7 @@ def __init__( log_stats: bool, ) -> None: logger.info( - "Initializing an LLM engine with config: " + f"Initializing an LLM engine (v{vllm.__version__}) with config: " f"model={model_config.model!r}, " f"tokenizer={model_config.tokenizer!r}, " f"tokenizer_mode={model_config.tokenizer_mode}, " diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3777e0f3a0601..993a834e5a720 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -15,6 +15,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse, Response +import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse @@ -168,6 +169,12 @@ async def show_available_models(): return JSONResponse(content=models.model_dump()) +@app.get("/version") +async def show_version(): + ver = {"version": vllm.__version__} + return JSONResponse(content=ver) + + @app.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): @@ -231,6 +238,7 @@ async def authentication(request: Request, call_next): f"Invalid middleware {middleware}. Must be a function or a class." 
) + logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") if args.served_model_name is not None: From 996d095c541e1cd67f0a7ec2579bc3bb0a435494 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 3 Mar 2024 14:37:18 -0800 Subject: [PATCH 046/113] [FIX] Fix styles in automatic prefix caching & add a automatic prefix caching benchmark (#3158) --- benchmarks/benchmark_prefix_caching.py | 59 ++++++++++++++++++++++++++ benchmarks/benchmark_throughput.py | 5 ++- vllm/core/block_manager.py | 15 ++----- vllm/sequence.py | 8 +--- 4 files changed, 69 insertions(+), 18 deletions(-) create mode 100644 benchmarks/benchmark_prefix_caching.py diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py new file mode 100644 index 0000000000000..c43bd9c3bed3e --- /dev/null +++ b/benchmarks/benchmark_prefix_caching.py @@ -0,0 +1,59 @@ +import argparse +import time + +from vllm import LLM +from vllm import SamplingParams + +PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" + + +def test_prefix(llm=None, sampling_params=None, prompts=None, prefix_len=None): + start_time = time.time() + # whether use Prefix + if prefix_len != None: + # start inference + llm.generate(prompts, + sampling_params=sampling_params, + prefix_pos=prefix_len) + else: + llm.generate(prompts, sampling_params=sampling_params) + + end_time = time.time() + print(f"cost time {end_time - start_time}") + + +def main(args): + llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat", + tokenizer_mode='auto', + trust_remote_code=True, + enforce_eager=True, + enable_prefix_caching=args.enable_prefix_caching) + + num_prompts = 100 + prompts = [PROMPT] * num_prompts + sampling_params = SamplingParams(temperature=0, max_tokens=100) + + print("------warm up------") + test_prefix( + llm=llm, + prompts=prompts[:1], + sampling_params=sampling_params, + ) + + print("------start generating------") + test_prefix( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Benchmark the performance with or without automatic ' + 'prefix caching.') + parser.add_argument('--enable-prefix-caching', + action='store_true', + help='enable prefix caching') + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 51c1a6540a451..1f0bfe06a67cb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -303,7 +303,10 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') - parser.add_argument("--enable_prefix_caching", action='store_true') + parser.add_argument( + "--enable-prefix-caching", + action='store_true', + help="enable automatic prefix caching for vLLM backend.") args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 08d519ab767a9..daf83827a7e52 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -236,13 +236,6 @@ def _is_last_block_full( token_ids_len = len(seq.data.get_token_ids()) return token_ids_len > 0 and token_ids_len % seq.block_size == 0 - def _is_last_block( - self, - seq: Sequence, - index: int, - ) -> bool: - return index == len(seq.logical_token_blocks) - 1 - def _maybe_promote_last_block( self, seq: Sequence, @@ -436,7 +429,7 @@ def access_all_blocks_in_seq( def compute_last_full_block_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return - max_full_block = seq.get_len() // seq.block_size - 1 + max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return @@ -451,9 +444,9 @@ def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: return [b.block_number for b in block_table[:block_idx + 1]] return [] - # Can return non-empty result only with prefix caching enabled. def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: + # Can return non-empty result only with prefix caching enabled. if not self.enable_caching: return [] @@ -463,9 +456,9 @@ def get_common_computed_block_ids(self, ] return commonprefix([ids for ids in ids_list if ids != []]) - # We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. 
def mark_blocks_as_computed(self, seq_group: SequenceGroup): + # NOTE: We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): self.compute_last_full_block_in_seq(seq) diff --git a/vllm/sequence.py b/vllm/sequence.py index 122960035e505..04a9a90a68bcc 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,10 +160,10 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - # TODO The current hashing function is O(L^2). We should optimize this in - # the future. def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence + # TODO: The current hashing function is O(L^2). We should optimize + # this in the future. num_tokens = self.num_hashed_tokens_of_block(logical_idx) return hash(tuple(self.data.get_token_ids()[0:num_tokens])) @@ -308,10 +308,6 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. return next(iter(self.seqs_dict.values())).data.prompt_token_ids - @property - def block_size(self) -> int: - return next(iter(self.seqs_dict.values())).block_size - @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 From 17c3103c562e748686a3fa4bd9b43ebe98aae3d9 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sun, 3 Mar 2024 16:19:13 -0800 Subject: [PATCH 047/113] Make it easy to profile workers with nsight (#3162) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/benchmark_latency.py | 6 ++++++ vllm/config.py | 7 +++++++ vllm/engine/arg_utils.py | 8 +++++++- vllm/engine/llm_engine.py | 15 ++++++++++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 6e3b679cb81b2..2fdc08c5c26df 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -26,6 +26,7 @@ def main(args: argparse.Namespace): enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, ) sampling_params = SamplingParams( @@ -145,5 +146,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument( + "--ray-workers-use-nsight", + action='store_true', + help="If specified, use nsight to profile ray workers", + ) args = parser.parse_args() main(args) diff --git a/vllm/config.py b/vllm/config.py index 876a439cd1280..e39fd7265689f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -382,6 +382,8 @@ class ParallelConfig: parallel and large models. disable_custom_all_reduce: Disable the custom all-reduce kernel and fall back to NCCL. + ray_workers_use_nsight: Whether to profile Ray workers with nsight, see + https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. 
""" def __init__( @@ -391,6 +393,7 @@ def __init__( worker_use_ray: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, + ray_workers_use_nsight: bool = False, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): @@ -404,6 +407,7 @@ def __init__( self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce + self.ray_workers_use_nsight = ray_workers_use_nsight self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. @@ -426,6 +430,9 @@ def _verify_args(self) -> None: logger.info( "Disabled the custom all-reduce kernel because it is not " "supported with pipeline parallelism.") + if self.ray_workers_use_nsight and not self.worker_use_ray: + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0349c3a6636c7..6882e8be34d11 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -46,6 +46,7 @@ class EngineArgs: lora_dtype = 'auto' max_cpu_loras: Optional[int] = None device: str = 'auto' + ray_workers_use_nsight: bool = False def __post_init__(self): if self.tokenizer is None: @@ -168,6 +169,10 @@ def add_cli_args( help='load model sequentially in multiple batches, ' 'to avoid RAM OOM when using tensor ' 'parallel and large models') + parser.add_argument( + '--ray-workers-use-nsight', + action='store_true', + help='If specified, use nsight to profile ray workers') # KV cache arguments parser.add_argument('--block-size', type=int, @@ -305,7 +310,8 @@ def create_engine_configs( self.tensor_parallel_size, self.worker_use_ray, self.max_parallel_loading_workers, - self.disable_custom_all_reduce) + self.disable_custom_all_reduce, + self.ray_workers_use_nsight) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c9bd89a1b18f4..8a2573034c940 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -124,7 +124,20 @@ def __init__( ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") if ray_usage != "1": os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - self._init_workers_ray(placement_group) + # Pass additional arguments to initialize the worker + additional_ray_args = {} + if self.parallel_config.ray_workers_use_nsight: + logger.info("Configuring Ray workers to use nsight.") + additional_ray_args = { + "runtime_env": { + "nsight": { + "t": "cuda,cudnn,cublas", + "o": "'worker_process_%p'", + "cuda-graph-trace": "node", + } + } + } + self._init_workers_ray(placement_group, **additional_ray_args) else: self._init_workers() From d0fae881143f07a558ea72b2cae3c4c6dfa94937 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Sun, 3 Mar 2024 17:03:51 -0800 Subject: [PATCH 048/113] [DOC] add setup document to support neuron backend (#2777) --- .../getting_started/neuron-installation.rst | 135 ++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 136 insertions(+) create mode 100644 docs/source/getting_started/neuron-installation.rst diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst new file mode 100644 index 0000000000000..0aff1037d8a29 --- /dev/null +++ 
b/docs/source/getting_started/neuron-installation.rst @@ -0,0 +1,135 @@ +.. _installation_neuron: + +Installation with Neuron +======================== + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK. +At the moment Paged Attention is not supported in Neuron SDK, but naive continuous batching is supported in transformers-neuronx. +Data types currently supported in Neuron SDK are FP16 and BF16. + +Requirements +------------ + +* OS: Linux +* Python: 3.8 -- 3.11 +* Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +* Pytorch 2.0.1/2.1.1 +* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- :ref:`Build from source ` + + - :ref:`Step 0. Launch Trn1/Inf2 instances ` + - :ref:`Step 1. Install drivers and tools ` + - :ref:`Step 2. Install transformers-neuronx and its dependencies ` + - :ref:`Step 3. Install vLLM from source ` + +.. _build_from_source_neuron: + +Build from source +----------------- + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +.. _launch_instances: + +Step 0. Launch Trn1/Inf2 instances +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. + +- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ +- Select Ubuntu Server 22.04 TLS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. +- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance + +.. _install_drivers: + +Step 1. Install drivers and tools +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +.. code-block:: console + + # Configure Linux for Neuron repository updates + . /etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. +Follow the steps below to install transformer-neuronx package and its dependencies. + +.. code-block:: console + + # Install Python venv + sudo apt-get install -y python3.10-venv g++ + + # Create Python venv + python3.10 -m venv aws_neuron_venv_pytorch + + # Activate Python venv + source aws_neuron_venv_pytorch/bin/activate + + # Install Jupyter notebook kernel + pip install ipykernel + python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + pip install jupyter notebook + pip install environment_kernels + + # Set pip repository pointing to the Neuron repository + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + + # Install wget, awscli + python -m pip install wget + python -m pip install awscli + + # Update Neuron Compiler and Framework + python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx + +.. _install_vllm: + +Step 3. 
Install vLLM from source +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: + +.. code-block:: console + + $ cd vllm + $ pip install -U -r requirements-neuron.txt + $ pip install . + +If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/index.rst b/docs/source/index.rst index bdc541cb2d58e..e90481845c4ff 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -62,6 +62,7 @@ Documentation getting_started/installation getting_started/amd-installation + getting_started/neuron-installation getting_started/quickstart .. toctree:: From 901cf4c52bf65472ca13aa4f996d631d00c2228d Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Mon, 4 Mar 2024 14:48:27 +0800 Subject: [PATCH 049/113] [Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171) --- benchmarks/benchmark_prefix_caching.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index c43bd9c3bed3e..a0307439cd5f1 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -7,16 +7,10 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" -def test_prefix(llm=None, sampling_params=None, prompts=None, prefix_len=None): +def test_prefix(llm=None, sampling_params=None, prompts=None): start_time = time.time() - # whether use Prefix - if prefix_len != None: - # start inference - llm.generate(prompts, - sampling_params=sampling_params, - prefix_pos=prefix_len) - else: - llm.generate(prompts, sampling_params=sampling_params) + + llm.generate(prompts, sampling_params=sampling_params) end_time = time.time() print(f"cost time {end_time - start_time}") From 27a7b070db526326ede3335fb07c1fa13ac008bb Mon Sep 17 00:00:00 2001 From: Jialun Lyu <43287111+pian13131@users.noreply.github.com> Date: Mon, 4 Mar 2024 09:23:34 -0800 Subject: [PATCH 050/113] Add document for vllm paged attention kernel. (#2978) --- docs/source/assets/kernel/k_vecs.png | Bin 0 -> 27676 bytes docs/source/assets/kernel/key.png | Bin 0 -> 111314 bytes docs/source/assets/kernel/logits_vec.png | Bin 0 -> 17475 bytes docs/source/assets/kernel/q_vecs.png | Bin 0 -> 42065 bytes docs/source/assets/kernel/query.png | Bin 0 -> 32710 bytes docs/source/assets/kernel/v_vec.png | Bin 0 -> 51256 bytes docs/source/assets/kernel/value.png | Bin 0 -> 121414 bytes docs/source/dev/kernel/paged_attention.rst | 525 +++++++++++++++++++++ docs/source/index.rst | 1 + 9 files changed, 526 insertions(+) create mode 100644 docs/source/assets/kernel/k_vecs.png create mode 100644 docs/source/assets/kernel/key.png create mode 100644 docs/source/assets/kernel/logits_vec.png create mode 100644 docs/source/assets/kernel/q_vecs.png create mode 100644 docs/source/assets/kernel/query.png create mode 100644 docs/source/assets/kernel/v_vec.png create mode 100644 docs/source/assets/kernel/value.png create mode 100644 docs/source/dev/kernel/paged_attention.rst diff --git a/docs/source/assets/kernel/k_vecs.png b/docs/source/assets/kernel/k_vecs.png new file mode 100644 index 0000000000000000000000000000000000000000..4b7be1385aa2e012b3733835394175af97f073fd GIT binary patch literal 27676 zcmeFYbySpX*9Qs+NGPC$(%q$`G}7H&BMlNmH$y3cBHi5`(hbtxC5<#gN$0@1QJ?qy zzVG?_tabi4v)0Vqb=BVc+VR`_4pvf-e2PYZ1_uZCR9Z?*1r83;1UNoNK?Z)qW0X_j z;Lyq}MMagQMMcS!9KdFl)~0Z9Qo%8CkJVJQv3(CcS(~xQ$j4EerNQvoA4U=Aq#hHJ zqhiUR_@x_Z;%R=%r10HolL3ep!qO3I2<)5#0_n64P@&4-m{`O>e z$P3KoYIY!Sk;;w<#}L38liSe`XCmcb@)KUL_(z=^9t7^YFACjvj2|aRvnmQ&S}4M% zD);(Ju8%C_ma7Q9oE_fZYbrncT!929Y)D6oR*8g9M=IIF=%*I@2(ALNnvi8MBJ@Z+ z8lSC7IGQxn{umMStNrR5<5z2f5kK{!r{KwyF1@tb;BY7>hx>7mfvE%6RKVFk(Frh zV*7;3OGjVv-A;{tOrS9-7QgtN^lI^_G#+{ntcOROAexv&c2`G3KewotVWTh2|qvUNwcw-M_yGFG#yvbeH0MyRP7v*4Gu|I*Rww@mL694 zf|7dtlVOj|y)F|HLG|67p{&(tAAfeaL~~Gea7a4%iCS@rkb0|Bd&1`g;-Y+cy5qB? 
z<6xCbSw}CN`s^J;G#_OT-}clL!i@N&0FvH z!q|iFea1|{m*ty8ug`={o45Y$%{DW3x!ZHUGpE?*l4eRcV^^0|sV39VW$_?)_x{+O zKoZl_1G4!%>)=Ol9E*>H1NcqZMMi{d;NiX!B06Xve6ZQ$g=ZpzCwK?bO&v}PS2PgiH{Rc+BFwzfn4F8T5Itv8aHf=k2 zJ@}b6n_qCJzK~xCPy8<4;rNsBi+$V6^2E<}fj_GvX8kq$pn*ujFn z3Yo$Ga|T=f^TI%gpMtP?hOE*HCrs!^V zb|lG|;19At87{FFP|`k@eXspVa*4zr0{0>0)idu-G($=~Dx6@+NG$vcB6=~IaE5FQ z%eTDYNm7`{&#!{o47Xq6b)&6?qIQZIBE~ZfIt+Tq;ct@VKAQ-w58ml|V%TC-pk-4` zJ^gf`quVH|9K9Ur$MZdi^`mqCbFLcF8#rL&|Fx*6dbPpM zyUviGq|C3j({n}i$jb|(1+Ep_2gL{3=h1T5y)YT}Datvbj|l%;5iNN-@<}WOY+J0< zXA;5vV#b+-ucPUxSZHi14k&A|%AVZ>XHnrs()R`6NXt-{P{c$+rM9ForNMnB74Wk$ z{7FnSWmIvIVd1Qio_(bBg*fPFacprEdDhuP=~XHR%1bH*YS@L`oT-VTi&7=Isab-# z5T#5;ZN{VM4>F9={V@bFwsg&z&n&w+Fl+>w!!$(rBpM|=3*^+els6epV&WL%7}Jzm zviOTdG|N>{)%%pR%T%%)q)ml!%id=~^BB~IG#86J6sp5QN1yKKHTbp!wuC`amqxz7 zmc!?lV8{xxk00xMsiLaxsP;ueH(R6BMW*Ca@GJRveluQ2>=U}mq$|tBVUA&ru`1^Y zXzpS>S7vF>K?#crt6G_6YssB-K+ey+h}?4VEa|lf8i@%J$C^!%A%Vm#PSE?;RSVUK zIv;f;s}HO?=J4iz&dF9?ya!E@Opi^Wjz&QDCVBEsRN@MDOOgs3U1Fbu#I&1=M0!1Y} zB-p>k5=;>+a?=o0epL-|ed+o#EIvN5GrpM9k`4crI`>AxX2Nu$AV*fBdxB|#dcq=G zPeSF;#Gq>usMeP`nMsAIpSe`KJ$5+OXkal`f1-RsKTkGSQ@+($ynIz_LuX3oOozQh zv{+M@Tf4R5bwzOPtkHzw$<%Zo*P=_>{v$s1MzY2!7iJgm9`hd3`NnxIQ9i~U1|v}- zu>qeB?`NWmH%rc$d_zt;yOW!zU9(-=(Ux>7rV zo$IloS>~So{CUuU#)0Jl=fNX(IrAX1y`-@%!K>Tx}XL@+IRGk!Jc;jr?srVd@yMv!9 zL(HYkI6wL;8Ej9FN*d-GY8$o=cMoAC=?mUi*6Cl4{|ZI%==2Xt47!LAj7Xv&6-yBB z%bLry9COSn%91qlGX8ArVPs8)_DmsgME*tnH2 ztjlpY?lp9^bE3$c&b-35ckSoVz zOV6=8v%fHPk!H?88c$`aE#B7_t#~(kXHDm6;Npn!$R@P3e`sB(*V9qJallcCDa63d zI-;MVFSF{(o?&RkUEO|7{B?0wkYVi6Is=9Y?YL6hNpPG{=h2Z z+b@R*8)fOStuz0#IPM=dbc?@ijK{l&<%g5bt-Ebvtq+z~t2f@2%h(&(FHa}v64sX2 z;M#iDdbgd+QioFG@=5Wn^LfaK$|SiG>@PH(IBLtx?mMI%U+d2`PVW=UGB0==hh3yL zprw4y{H!Px?9-r&uQaP=s-j-;qL$nBaKd}+h8JRXuh$H6^clNjeN6S(10#^sy3D1Z zv~;X=RtRwvXLe$c^($7?RwvS@f^#k*-o*!XM@RZk zPXpT}^6l-{lvHI@6&>{)XM)qQZI}<3pSJ4Ud#q&N^&P9e&*Ms=Nf{P`?aM7C{M4}A z9jNRxznR0U5_t2%^FZTf?|k^AZHv|5SF2rVz=Zu~W=~gC+yYBV-C)atQ`eQuntEQ* z(^ekX^ytpQtlQ9r&WZEPF0(#+gTnH!2K)2e!y)uqx7yN$ROiMbzQUV>lzLZ>1GGbo z1=-e%Ysdj?>@g#$5h&AZSaI|@e^lZQRnrgK#K=AGI7om##%Z0T2 z=l~V zgqG0Awtb@_;oKbI;NBC#o$kVgZ_1rct|7>KJr2RXROBCX4g10l=cx-Hq=zbuQs|mF zhe*r;hU=colmf%?y`k$ix#wFd3JYerE*bTBwojCPp(*b1VT*2lwdrbp$xLPnK|q zf1gnR{vST!!1tlepZ`aZfpCw3v**CqEgj*Xrx8ukAN_L-cMjZxd#frcEe-su8atSp z+B#Z*oz4f+{DBLoc2e2^1@NdJzVOm2RKI}!XDrpUoV4WSc#Xj}EQTgvBU2VP8@q>o z;P~BmfkPWpCqptf8*5ufUN-@X-%s!Y#}AiTDad|5;$$U2p(U?GCJJ^iCF5dw#qx?m z5RHtCjNie;j8{cW;%{@{On}0|$;pnFmDSbNmBp2v1?*tX`kIG_hxHX3D;pa#@C37? 
zyRDO<8?&t=<)2Re=|{}e(b&P#&dCyNOZL#Op%K{GNq~alVW9tf{>;QVE|JR-Wj`%-XYW=t6 z>(^`?|Fh};xb=UVsymuGh=Of^F`Wee%dfwU|L4uW4f$Cgmi|9T@rTa8uL6V?MB``u z57PwEXk)qQ06r2~iYclA|A3VJ{&)hv57!_5fiDAlx0hTr931_$AdU(vGoUIXNO2(1e9q+%e*SDsEXvmx5jP7S zo*bb~2@T~l8bUvL?Lv^R5#oBFw2$ZzZzW%4Ptzig%e^X6qvP4pF673nb!I}fZS;%L zTR3<`EaCtCgCduX0H5%^xDVyu_g*2P^o~CJzb%B}{Ct5Mh))z;P{@4Y5dQZMJpT3n zy#zdg^$iX_!MCjso{(p}dpM_T@3seh@ zx;~35VHf_HtFR>sRyh4vm)zLj9C(fOqrtgIr`e=e6wF$R5{E$5P`|~m)Tgv05Qib(PjJ}7^c*Y5Jjt~vDn~zf^&H_;!|qi z_7iCYwii_My)EFedX~i3$xvcGesl)EI`$F%3CGrILE!g}^3zBPg+V>zl~QqP}R{hQ>hbM+7dincZ@I z2gkA`)wQ(~Jnmr^6S-?OjsU@AY(0(t?rXnofH%Pkx=PO<#_@ZJ+KX5S5ZocU4w&;M z5qaoW?a~keP8n(89xKBSA#rm2HHY)Hic|T@ls1O!NG1|tBqaweHwRUgZu?Ep!lFrk zB=m~ZtZqi)8p^-ize7anT_1^z{!K3#DPbCm5{;6?tnL>euk!2lC`F=q2knRm%#*DV z&4E~E)hHTyhh!Py?zU$GxCw)+8$c zB8wioCD6Uf5e-I-;#ZG6Nxgo(?YC!x|F)ph^PA1s zEV{;oSbSqeG}$9K@;P+*qSN3z7)`IL25?-J;chbtT)11-x>ODu%@Exzk?(t(a2&GY zVDcU9@3veQ2Q-1fOKHT1Eu20HEHYZ>%T{emJ+cWPora;w6>+OI^R}w2t+y54s;kZX z&l_##Yl5A3zlNmO-yRQ?s2e={qr=F=00|;rjpF<*LHtO7*cX`Kh#XUX-iW1kKUQhT zaoFhJaY?!v{~FF?x8keQ;8YN)tGRxj{~4FDFjqb)5mZ6QVUb=Ty<7sPLfi35#YOu1 zc%;SadM8gt0>q?UA*NoS(lw|nak_r}fxb^Mm4FI+7;<~m^L}@_1X^4=~j^g=IOC0P#ptiLy5UD6nn=*_4Xo0g1` z^`Di)`K=(Gc449aC8-J4Xw)SE?B;^oY4mL--p2Jzxv``%?D{N~2TIGGlU)Ypetl|O z{gr+A4W(&XGrxX=(`%>nZg!g+4A>;eT502fOj0h1X~Y}T&B25~j61i@`%Y}dmy)4G z(nN=65|3j-K9P9P*|cb+oZ9eqdmc7##YFGA(08o&MSV>b@Fd`s+SE2+#B`7GXa-bM zAckChZh~bNhF5lV1b_BEGA4=jwr}`Yo&8$0!;WsOUZwpmCNZy*oxpU9J#?~w-o46d zW*CBU_+PeOS-t%4+Tz*G75d|JZ(dyME${ZJ3phI6Af#GdSDKHU%Pr3x0rWO6jv~1- zvYxLQ3THi@jsDrsSjN$EcWuRp-cy8FO{0*!&6qz`ph~+|IU?woy;V)|Hru)%>LRDD zt*!Bvnh}i3veTn|GdyB|T;@A?d9GD;~xx!=s3x#`KR$d1=LBw z`Tl%Tt%xZvR}#!1>RRx)ml6u7?I_A%^E7V_*l}E|leMgYebSaxXap@6wNQn?mEQDU zVy6EBEHxTf4&(PC#L37UbM@@IaNUc7TT&!;o-vf}Y%r2#Nyg;++}|vhy7zY9-(9qt z3AdfVrSUD$B<)Y+g+-EyxWDi&3LURiaHpYktm|!J-q1HWh~cuCbEBbb3yo#gEu1RS z^dyPe#VGTYD>G=G1XxYf*L>7X2n{5(wX=5AjaZoi3-eQU&&^$K`A7qDefDp7OQR(aNy;P7By>Ti@;QVTTiV~ha}YU1?O z3;%vB43a%8cw_A-vXGbR(Vr^6+AMd+7&YC%e2qtOb~>S$qzH{@kP;SdDK@Xv^VrEL z)UCCZVTueu!lsc^T5NPxq+&PfL~%ZBzMz?oTJEK6-k=xRN_GsrJ|19=Nd@)DO%yS) zHlIc|z|QBRqNIxlwv&5EJekT_z}Zbgw?8q}E6s+Yt0E?0UszjS1*ou_4cZYgGTLT& z?LF>Wuj}}PSqqkFnmOr+56g@V_RPT`ghUDp^*q%4E%&D&Bux=-%Kb#-qI1ripvzy z4#hLbuueeE7RPUUgzgN9gdqG8A?xH(t5D%j8Q@)q!M4mayY&P!Kq-CozB#8JFK>+q zc6Sc%JJU-b;-2tgyOx&huCknX0&}8XTWJe8&sJT!-(dBT?vvasKGJb|MYZ8JAl+XK zMwQ`Fe_Z;+l5h7-ttN4}G>;-zZ|6W?k1T$Lwi6k{h8}Tv-==|3@)ev}48NV+qqvkkDnU0O1tED%eg!u9s_}Ggztg_`- zJG#KdKu`!^^~Su^sdrjIr;5D2at)AOV-{{)bi|@P( zt4?&&Z*nUiSL~1}Zfut3t_Xa_W@}{q$2&}S1N?9t@VpB9Z%IW!LnIC#3+vaI$Wv6* z%@Vr58iCff!b;X&3S5gmefjEp`H&U0zOV!WSx4Yg)fh(2bKXVg#)au@(zhuUiZs}z zN*N;kF11hD?&Xd%)9$jD+_zHJheDkNt|fwl2YD)uy3l%Bn_+rR%l!PdUw zc!JVraK3V3GnJgonEvSVAG!G7UZOi53NIPuNpTZEI7_S74(n1mq-}d|Xh`7` zuvL{L^SiTc!G-RhhJdNPYsW;`R>dXIq{FEunwSj|+#WX85HOjoH}adO5i5!{Z4BT)oNSeaP2%rA=~P$G)z7T|LcwY?$w z|)QM#?BaUs2t6QKTs>AwdsYebH){eX|R? z*MXgt`8Zn-RIR!+#sc>?_amOEoQo@-7M-s5pUcx90$%ic!NmENtm0z1p(O5+mf(wj zY(gZk((si&yoZ&NEdjc$!b+&NhsE60Y%?qJ5gXfQF^MQQX12s! 
z6_6YU2)zWkv2ot7p5d+Nb^Z{-Z`%mCzM&i@U5sRu@X<9k^Sq&PJ#GX191nh_b!{2{ z$sjvav-LX0I|@2qTSmb7KH8Y@KmK|sFcG_jKHT3E+5A0`?`g0-b7UQ%gFPOrv;##FKbByJaImR02v-dwhDh3|iuirIwA2S0r?> zX6oq&yy8jc8U46Oe||%0iEnPqfrjy6-&)S5b%{R#6kI1mU6R7;Y3a>sUPMqlH(qb% z?<+Ly2&Bi}?-wld4cT){`k^UtTdd!x0eEua81_uh-6C`gB46O+uXFjxAKMrI<7~qr zT#I}_`XjH14mfW-PV+N-8%s=1SVZG}HSVOe6&7<`Eri9-~ zxZ6Ht{xhUPso(vYUe@+ggYA61qj~=oFWH)hbq%3rsa`^4t~xF8_gA5$LaDFmy}yKI z&`xlkszyD5;m{lRhLd-14kZz94aL9i;3O4j=(JC+rnrLk*b~YpaXAYp&UIPs%vR>@ z@&N7}`}x7*u5%*!J~SpRe%n`W&XJk>rU=6&=jCv!zWU3F60jW=^(?U<|f+N6!!v0i6=9^7d4d z)#f|4G7|+n_og$Rtz=`{89yXWyA|OxK%i*%z357GYc(8_Mr&KnN;=&G9%E{-`HM-& z_flj*7W!!^eIj2-OWsvmPDlvc{_5D=f8E!kwfmq7F$GS;tD$Z&Ri0;1()mXkN&SYU*k!g-T|EP=u`*k`WU58A#TW&(cdyOt zH^TrRit?LhsrieK2%`XEEh0=S``Z$%V%ar*?Nl7%b=k8(wl8hHyDnCUyON}lPc$L9 znN(FwILR_`T>bU4V+wY$3?{iAmt`#=5%4f^)af>6TP$@vUYBO?UBQ#Ii~tMtN9Y1maC|%D~DDKd`^d)q=|a9cF48;}+x48SG! zG{8E=>=w}0{nz}vUS%^Z`VCa$@)r`DQe%`($B|_-3vWD3qz%RuON%fpveo1tXFMUE zWIMHyaUXHq9?hV?>O<~3X?sjM$SXwL#D2O>GE?;<=r0)c6D{AalGgQ)Iay`>Wg`3-wtY_Grbt!CMR(jw#Q)>um?(^@`x`HIX& zNq9uw2vA3YZ6&F!-QlQ)1K4umjsyMuPXuUXGx=wm#vIpQdMsg&c<7wTLt+|suNci% zIVYB$Z)}n|4oW5Un$)nB+vdo`3zb=9O z)ee3n^KZzGnnqYSx04_lO7Va13IG3%{^CaeKOz1W7{K=%UVrS%bGqqeR|vGr zxo@Y1I9gv268Z4-R|j3$tJ_J&e&=J=v^P~z(XW4Xp;+Q`ScGRV>X!NdfbT&4DdafD z(Lq1k8Ul^85_D*~GaTY!6>>f{rZqjOyevPNAia&2A3idU(4!UFIEyMvj@lKu7l6&G z6~nk^?!F)WwkBhEzZ|{4I|q9vIp}*|6ZDQa7dBdKmYjkv4m%SzRKn8lk(Yf`b;)a) z8EQLDT&tE@z0N@-5J9@z+wUhI&z%}?${D9wt4=`>`|KNzDjx;Yo28L0?FP1Qm=Eq& zx)Wj@%g+WXeelS+PA5b!e1`fIbPZ0v0XI~bSF#?$CjjoC2~o*dgR@nZW|;wgSfvkL zUGv4s+n==g!or(Qn#aa3T!c3P1KSONYa#Tixi24FB>;|7KY&QyJKyJ!3uQs@D6aAZ z76nJp#(Wk9WoI`R^UB)-1%*k2JV;j>Ivc|r^c{4*E0SVB8$R60-VdER5`4T%DhSz_ zJv<~-OC*{=GU0VSux8S$du3mf#%gqqY=4t3MRjo0t*GSZJ7_5<3YX9lL7eq?=`Sp%~D z{D^a(<&1T!xxN1W^q1uM&hZiR&+IXK)7z~dcUot6Zul4uqpks?r&et}t8&4U<7C(E z1^8*1;~-!4uLPWf&bzGu6wkP5Sv-{tf~D@7Kc^6Skg+zF8r}_+G}SzX0>GJnz#0=^ zrLKodGI#aNryy?J3!C~XzZn0tG;AL3{HDx!MHbkM!{+ZWBK-}(n!Xm|fNNqFYBYv2 zy3MEoXH;0+T~hP+9ZHy&X79!pmHvJ8Dgp0gq*ZyPU`slDf(Rb}E`b9X0tv!%EFv}& zDWfuyw;3jc=m_>$>G$0kF?5uFK+7ER@6OVeRgUG;<0bbldp};k9?cH7e~Q4?HCL3c zsRJKeRbx3pv++_Odvpf$M&-HDBLpOz^m{z4xa@y$LR$DzS%=ltl^|5Nv*p*u-T8vv z*YP>jWZ2~rS#P??!--I1bozdAq4s|x!*Yhd22bh+9shCRlz0*J;-VFFUl_|TUW32j zF-ObwP{=YpV!Ni7dB69gyz7U-WI+(3Npy&q_&$0q%PglGsVUGywC<;hzdh(Y^q}GS zLf2;Tx=g)C3)zIrb|D`Kp}XWT728Bv4~~LFM!*KCYeh{(GUB#mkH>|j50JRSNPaDI686{ zYEgrmBZCXfrTl5{D{H}OzuD*V@U;omJFR2Zj509{;R`;fz1ali55ZPYlCXntATa;x zcRR7%R}ErR+pRhCq^lZ^wRePHd_(488XuY576@XGC3MrxM^fM4ULJ*tjQbE16^m(+ zZG(uO<)&KzT1r?M#ZF%zXGp6eW~Cw9B&mgtfyC*)B)!#%;3^!wVdaXB-IDAbYHyJL z)ybA_(WX@@goDVk!e5vi@$#1W1B2y*JMk0sS3OCp08bpq_f8igs)C|OwtXinOw&|D zvxS2CN`yw_>sw_9yztiSexVzRRynQTaKkD+*69m>HwI;BKAP4HH4uHMj0oNoYs=k@ z;!jOy>h&RL+zUW%>j-$AEZS~Z;S<*J-ZfJp)%~LHGs_$QiJup!j}0%;hvU7fu;#rJWY7L>T`M`XMX~h*JhBP5=M^ z7+G|Umpp<`7`O3!PvQ^vLXbQ?iu!MopbpJWf3?58J02BCTxka=>wSWQ$AA2o5d}xX z7d@^0I+FLFYyvB=GtV`M@bw>3fHs6OvDyH>0`ORv42+<6x&Afs!{3q9lOHw#cK8G( zIe@u59}*__LcI@ z`_%{6Y5T|f#zK(;Xz@M2FW>h8I|z~;M|u$i;Amp&35!5{qVBvq&FRH3Tm-gc zbZyt0RyxFDQho$B@v&rTO;n&qMg`&S(r2v~Hy4MRB|6nAyEA1MOt;XfBKN|KJd8i7 z-1g@y4OC3-$3vflM`3Zka)LTvIgD7}!B(Dy2Yai=njjV5S(Wj_<2Op&E?q#UWWN$} zv)HtmBFb;{$4roPDEwiaKOWG_Z^8K6i*8tz5#yg;Y_MaU+Dh)MB+1B(XM&b*fY_Q2 zAAYU6%4S|;D1p7lihKSKH&NsP_K$A&y}?9ydjSGMU<;1OpjO|r07+q*I~(oKoZqg? 
znh$*F(o`7;1c*{`c3``%kQ<`O>TH5(`vidUCQ_mS8Lcdq#yvJ5aZ}=^(Jfg3 zOk60&H$r@edT#>Eh==N zom#VTIa3P;tpWM)kF3Io4le;mQrnd{JXqM0PNW9G^L+1hgVWCHcCCic!C2O(j5LKP z8zpv~a!a!tgOoh52mInuqAwam9~*@LRxHmeci?c}^Ps-x z5=g5UQ$)1H>&0bXpjsUQIEL{<`mg#Ppv$BikcMHyvw4Wuw+5X1gPu{cr4q(H@;_*G zCnTpF*^Tu{^SN`UXlERwj5 zcUOggr(;&V-8yEw2*7Ctgy1ISzM>h5EEAJ;x$>}BRQ||Aq~Y@>^j?rMYtt_g$%gSX zpvC@=ZXjEY6X)x!Z&QXlL+$c)WjXF)F^-cPaiG%mDJ#o`y7Fs4fYbmmstF^w%cWE{ zb4D(qpw2Gi{bEBk0CG87Y2Lt(@SlV<+#EFSCmCHWm89~scQ5=ultTJko{dT-h2MR=Twa4gquAn3cJ^62GJV?2*8AK%@xFa{TiK6jxD_Nu z-q1<1e6}JW9*}phxlj#U+#Zf?K zKRRp|qdqoPSGgp|Wq%GjXH~{ordw=fKkEuqUljf9_@w)(>p13)u-Qx&qUEcnJ6WQu z*$mKh^YB?w=QwMv^=#yEqk3o~P)JeSp$5c0Q@EOX4?x3i9Z8%c_2L^=zY7oyJ}QoS zJ43*o-YJ;&7)aJIzh0MH=CjyaY~&_qd-vV9QxLyhV{hOAZ_Y2fgL}RF!ZT$}PY7Sd zvO211?voaj)wUZ*c@y7U?9ad0f|@b$rM2P*8+rQb&I?|xM+IL?f#OEMHzT6Qp5CCk z*Wjh5PM)_dvp`ZfLsF+<3D1f(Z-HyR)yGG2Y<;T%&<~coG?v>!G%tagljJqMo6i_( zFM)l|ekyL(U6oyYZ8Q$?P&mXS6||O1els@u_AbO-dUj*_ou%=W0ON+|-PKlQKe+2N zZucWh{&&-h2ixZcPYxS)St_4qmo?qr-Dq#Lz5R(+2iyCkn>jw*+%!Y%VeqQQydB8S zJ|U*!R6g?}>h``^jQhcwhGY){Hi=c6-=oKg-g05>Z0oK;-eXyP*f2#%*tXE{ylc`S zpEET2jf8Du6om21sSWILV5i{DfIaA8b0%4kZTo*(96cs4yq}A^H22P*8;BWtJ$)n@ zM#8C4qGP(@&H8CnTx6C!? zvYMoxLN(V-Q)wHnAFG)sidvdk^|_GdY2icVOe+tU0T@>s4>j|L&*RxJdr#K4_^91; zFHn)Czjy3k%FPEp!8f{t)aJ$QG<&gZsF;-43z#- zXl8$A=vdI5eiM+{&>9HxhGY9Rn-zYqd1^JtVD0V`)%9VY*!Q9L#y3`@Gea6n$B!SE zUA>*Ju}RtvIMyF;p}J5M!2jl5n{ikymCOj!g}&ILesDCqCBgM4!+h8&!*@RmzzMHd zSJwUemJ0TJwy&g#c>1%mj^h|_UQc^oeMxB1@3r?j?k_j)8En0E*GKH{Y7(0}!*cB@ zaA-5%pxo+bZCmexkn;g?)%v1%wSf%a8g929Yiif$1>|N_UY-CW0#Ux&R5$YpJK zmBP?NZwCi@)7MWOwS9n+nTg4RB>iK2lY8$3SrXfx2Jc*HTaQpd(YdkxWWAbd@bXaQuWYZ*JO7>+~Wi1jqX!+ zx!Sw;lk;uS8}9E&;o$D?r~6!-;ovyTA4);@Q&2DUJFa>pEoE(T#BHYL;sp7FG%Oh` zNYqrtb4YTge-z|QQv{WCsx)e0dn?_PqEM8BzmmC-*939KYWd%!9Ui4kTmWEf^M&h$ z6qQIHjPe7;B z6K-~33WfAL0eb*%Gud&t!Z7B}EXhXW!92>oc`$4r0S#7ebW66DEgiMW0o1U(?9sKU zo?5**UyoK*2=2&-G|rY(PwUugKPt0j0@qe8%Q!oJhkuMSwUw6!(V>?qg?VmVaL>f3 z1^iMJf;lPkO@k+Z-bwSS0L2-1`DwXq{Rl{Q1g}*?#YXfw59`Qwfm8f1F1sbQ8v0ya zLeTuI)jA;UZlAUqcsmga)Jp_S$nzy|+32J#by&Lp@B}Iz+;q6cn(tsY+}V1b`_-Ce zByLU^ezw>8AfVEMef|K*8>wCbW|9qd=9Ld(2C94xIl8M_zW*A`R}NE16Kqa5SKN2`!dw$HpE{SGI+IQK+DMFOGd%!9 z#zS(3(56b6$n11`tibTtp>k?}h7(P$2nig$P@SEnU{?`UV8yvTB6Q#M%D6kk-zNI! zV-2C2PoxmH=#8~7ArkJn6(OKn>jl@M(?mN%QL)OZD=CVWWAl<&A$j<;!$x(c3pC}VT9(y z8R&$MgAQpK(*)uor8m$>8L5g>H}qX*bJE{_d1^HhlRGZB8brAohIauBYUb9Z4{km~ z1Ii%u3@7uIKcchLrKc}L_^>K=cJ;2zdHUYC{aP8kon%#ld?I$yicS-B$UY;rPK8Ql z*C&BncsNNnUfZpFZ%_3+(0&7Cma`fJ*TY5o9aK|Rc+-Tr&txS+i4v{}$*9)=zv{WV z#PMRumdL*D$Pd~yFBJ>dR-U4+)12ks<&uIu=iI_uD&4MMCVo&xvzD1cyU(p~nKB^l zuS;N0Nd@^!dnOe4n5!AihA}Lq+~=Muv>hp(+D)UHHi4c#ag!{#AFh=VSq){W9 zNJ}|=$H1<%4OL-JwMh~EZvl~Xey@FI%DQ6!2(#2PT&*8#z-HAB_eq=p_W+SpN@Cdf zXj~@;fTay}r0&9QfLhWVaYnDxG0Bf5wM)F5ww%O^;bDUPTGSeBm3#+vlu=%Ij#^vN`vFo|R@Z;{#ry|<>%A{tE zea-zj%C?jUK+uObr0i{aBi+>j(k5jR6;;-F-6vE&I}_%#U! 
z@;d6$f&x$&&Q-q%Uco^lkyl^?nX!+fE>Jk{8C{-R`C+(SscUd- zg-hXtTm)H>zxL{;tVcNK=tEaD%Kxuj`xWZ8;KIVbW7XsH!-ZjaV_;>3KeHsVw} z-cKv!m@3G~dHYv2?a65*?fAk!l4)mTrH`yyC+GS0)P96_w>Z;Hhrn;(v7fD4?{#KHv-*@TRN(wa=GNzC8IVBFJ{5MsWX-orlHw2+Ny%>fQ`n_deU zH||E;y-PJ!kh@K5jWt0Pb!!tC$SQvf6T16>yhemjPm!*T-{&e^Z^f)Bt*YXrl5s{N z!`QLtx~xx;+izrk^wAAU*K)O)ME$c8he`*7T3nXXM@8d<*Pb%yd6a_Hw$00w-7j}Q zgcS88(cQ+!2N%SSTme<8S&zJ;E<59^o6^t~m#wgEg`N8&m&+*jU!b!#eS75$os`r5 zwat&PILBH}O1RGyZyd@)W4qQj%bqd{u3^rOg-EItWQR!sn$e2`U233wZxA0gY&!I< z(qa8)6F|$=#i#q+C5zhv89Bp0n=_Y5clVf7W>cgc&pWn({gi-5<$4iM9u>tP+d@@^ z$5kH)tKUjxWRM>h;<%HMk@-ZGB@%gZuWA%);XUsJZ#KCeH&J8tnreaV z@f<|7zUs+{Ai76jHXIBv_99rJSiRKnd|%2J8s7%|T8}GMVEzSEH{!XTTPK+CQS3yr z^NC#UmNv5Bb2Kc0TVW~_7)MYZ=s>zNG<}x!irj|&?e*%gz{P@=+0+_fTa)o&KnZI| zN8oKCk4(W?J36aL?ib0e=iAeeU>xl8tFh+;>w}Nzu}XG7KpeyKrXim=)mYYA`V#!0 zar1W}T&O$9Gd5@mO|{DoHvKHdk^ZB+Ry|Au+f~&)c-Eq`9LH`;3?B#O{MSHGXTH{h zi7Ra!@f{AjolFkCqt%BZos+~aDGuUMDe65BNSFYzf`hpA1Cx2M!#c%4Vjdb?us^x^EVfEgZf=SX^Kob9=iH zbyE2w8DwyHyZLOa8uWSVHd63nAMC@|Fx{%K!#y)`I0}wSeal09MwsSgRn}Z{LfW4> zzeM+bVM{S^VB#z*Ob~i~w)-M@fqAXHsb*Pbznk%JQAL&+7wM!QcT3*BQ~8#q#P;XM z7olaG)3828$aByQ{(kRp2%?k4c9wJG<_UZv#5`}a3z!Q7qBMKrAhZ>||O ztr*T&Wjk!*w4^9glf|)*ry?nBh-o?kr@y6hGUZ%q?+%-yXsO0igT}R1q-j%d2AE1V z=fSPSrbU*KE-oRllFp4;u+1V^Zb?V%P*(Ot9Fx^N{muv{B5#av?AT0|>sLtY@#(d`iKh0OWwa6tZfg90t z&X^>b7*ntqHf|kiNfYGVnmvnA-FWjJ-$qKZvT@Io&q{ZXpKEntC_F+!)2Ah(f%NXz zr#tSRuDQeQ%z^uzftWN~JvABcw>KB9_xet(dRkxY>+;GFTJO%v7BL~e9&PyqrGPXK zZxat!_A&T>32%W=0JgS5ZwCr`mlatZ8H>M(;4CvbWU@YaJ5eeHDJh+Z>siTep499O z|D?HBw|sYAJIomIxqmh0Ej9Zy5rRdBC;Kz8v|i2S7Y%3p;C*fz{VpUaB1{+vQ9!pCaduhFe;o5vf*6v=c|R)GUAOu&HGRzl6@a$)D! z)ab+Dw<#fb`c~6HOWs2c(3$OKqU7~c)y57}r>2cq-G%GjGLTc5_6rC@-3cE%Z|g_f z)0Szz`MafbGSHN9TtS7s4M)9*fj3e^;)n=c@MW!jFw(mKuZT_@&)%Xt^ZhMBe zXDdev%^q9EHec_#Y`$}te)oj@6YWy52!G}4{aS%f;-AHab#I8dtm97t&nRa(Z^RE( zzs|rU!}G(uI-YXlsDDheUV)q>E#oYDuJ}D(VD-5gxJ;cZ=+16?RmWglX zo-(}lT(@*6_sVoO<&qSBm~4~>aS6Z!CYK-FW5F|s#fgh%HioN~bm(6DW(1@n5|t9v6~GbJ-)Nu1?1jiYjb;8$h(tu4qPG0o;4k}J_e8JW4< zPc%G%!wI|z%r;)K?;uKW?snWq%l+|V&5PQrZN0E#&AFDo1Dy(;5T=r*_arcAf&OzdY4kXgPOcBSZ$)-fty@keiXsKUp^J^}SmtjV%{ zUw5qt%FST@_8lR@&>%q3QZqK|6c_dQ2Tz_5V}LV`yThC^r;UCI2y@_BmJDlffi(LZvx z9=3-`izFNyCC^5Gp{ehzzV-B~9`JGLyjFhPC z+i-ue#!ic^m-Jpu@W*ONtaHtMxi@jGw_W;x$6s&-l%IXKl*EgX=aak2kyGf$n~gfs zMeeEjX>BrF6fT5;faME^NCxNEkEqnEW+sO6upTU5;RDp(_@zVO&5w!=DS}wwy}O85 zDG&~!;2}{MUGK(|_6}P4OB~{o{ z06H*1L*I*sQ2h*Lf2Mnyg{jU9YqS(;u9k!JDgkFQA!nnefWA@>C|Y z;Uv;tQOGfeigZ6W+AO-ZY-7yqkggBMwTW-Bz(urAX|j7hO$$Z zq#8o@U3OAPijvfG$MSuC-{;le&-2go-@N#I?&scf&pr2?_j#WK2qNFCtYDw zu+UKu+%%f`up8P4$*1I~kH@;#Ci!{+G@%zYY}|ynCbC+0Ur+R7@Eh@3u|m5UZZdM3 zs@r6{Sd^jATMb6HfSs3x<)6ByxwDH9X?>DM?!y{lZYw{aiQMvV*^(3pX}?GroDdfG z-hD|`{9E5=5G==|81c{NqFC)m5JGC4)lxdI0-HBWy$3tpfS1yn0PS$zm~ZdxhN2bL z+${1iDJVr#dstbCow&Qt*r2LL%x=w)K~Qj%VO0qZRo^4Q_QS?8>Qn~Gs!auGnHwwE z%T?a1P4V14AGYqP<}+4nKgWE38p^7u*x2{4}{0 zZ-7YLB7kOv7A)T}8}md;E}C@az=yKUDBq`4#Y$~U-BvCc_kO_w1KV=<2Jrnwy`H>0 zeXpDP_!$PyZR&6Xq3OW$K75)9Vq;rrTjR^scS6IJOB^_O91`{pa1RH?LkFZufrAs` z`nz^8sK4(jw~YWrsaG?-39-t0U5EQ~w6X(DAUu&&Nmc{J zFdf*)InZ8dMCsNX&#NuJdLS*%YP|=Xnq*OWugz#FHut7_lOaBPZ?1+%$kD?dVH)@Z z9_5|Y`^Cx_2u)SPZVsmCAvUJ(CNM<)4w+lVcdd74-wV_PiGKvC*p+S&lFB-^Qho!u zOXc9Qaom3;0pIj1)R@$V7PDk8IIOXHeo)2BRF+B7}o*_UXv~1N5+l61N6NC zK%cb&-moZTSw^?buG_UBCW*#=-6D>j@IwQ;ZmeoPyGIf%K#W}e5~qzc z4bYeY<>@z)&HkYL8wJSoB|NB2BiGCT^KY4-LSKu6n6UTBkNc64WRx9@*$&1DC(uDW zVXA_X(oq=*u*G>sGhCEuSHcpm;##m``=^17;wI_4Z;3HI;Uq2ii@NV=EeW5XB< zv&cBJez0LF4xf$Z0neW2&?Yp3N2*dT*AJ8Q?dH!I`tVDPKzc&9ly?q~*V0*MQF~P?a=2wJ3oA!LKvQ=k|brL&K*dX 
zO6EG-cDHx!#7r9B;RN#bI!K>&q@h9@r&+lJT=M%T%&BGmf@#!8gpvz$2zX8a%?r>8?O<858vS zFPcs%0Q2>zMV|;58ziF+S7glIT`a1>oZ6pu&m_qR1@59kmOXbk0c8O2LmW)!Nzvo9 z!ph^@3g0IDo;G$1J9N|LAw`3R?E=p`3odPriO%+;LkRg$p?xW5f1&M>T>dGq+UiMx zSe1?JCa7w(%9^Z`Me|X*3t+Z-5X)E=m>iA2Jo%V+!7#%H$=s8iifC{)-W7fH5<^(; z*aZxlWt#scN}H7TRte~KomS+1ahYa7i+lBFX_x;n@d<=B9Cq5G+2M$R=!6=H0)>8i!^8Ae~m z)1ybRSu^F@p)s~^nS&`FUS>)tO$p-&NmXv##rAl;c#>jeyfwsJIDq!^R@2bJrSU6y zLXJ}xkUzGq=k)5>j`GL8u6mAw>1{NaZ3|DCFATPC$Eb;ZgjPL$q*!A1D$<|-@~Q<( zv7gn%o0s{!Xf@3h)u(S}E%kTAUqdeMt$=uz{Kw$bfR)fU){1xI3)VK`3r4}E9lG06 zIS#5jgx#^FZ02$Akw;p-f}0TYt%kQZXXyg~+eJ$Sc8f32r*8)Bknj>}8n|$a=)_js z_2#5ovF=2qafW;U$r*kdaf!B1Z+gd7m;KDE@kOUm*iuOsTE`t*?z22?UGnClx6FU~ ze)C97+1ly)M7{bgeaNKjW_$VAmW{3r|B1bw^_K|O`4lbn+geAhITC#rDT`R`cvn*6 z;526%;TxB66Hr9>ZK6;X0Mi7an>n3v-u*U1N@*F+{N zPiDsB6bp9I{rO)#2WKfhHE|dbL)98MD&5Du#RD`r_9XJ&7HsMJl#!uP=vn!a)s!AI zqB+Q|*(#V`G=#4^-F;yAB>(mQJky6~i93bz*(?F!n~Om}teFmN%D$fM-QwJDDSZnt zMMqI1a|&!0JGZ_Kx)98unfu#jB>q#rQ7`F3ll^Zo*V0+uZJQ(up(VZ(AiQW68(wzl zz^3o)e>Gf+!s=72dE+O3lah@O^Pz1#B@*)~3*c=lU)c}n)0O5vMi;IB=i5&wcC9;1 zC6qo6ud~5qeExA@4rjkVPaWE94AiBSb6>j;%;LckdQ3@{E;gm@k46Ma&H^Lo)gAel z8QMD|WRz(-z~Y*G#>)N4Bnk_JspjKp1NTJqAJQWIR5k(gL5SEh9Xdk)`ab~}bUbWk zUr`P|bvTNf2$+xlOWoN&?Nl}%n0I5riSU0faNniVC(WnmhBn8ZfjAuRKumZhWge4` zNQFO+S_I}JbxQK?p`&-;BJRB!ZVZH{lX{){PoqjDf%)Xm3i$jnrQtuz6$zZ2-4)g{pPC4A(n5GkL@e$hA2CTJmP@`S?5v=;#Na=vd2?}D7*R6LiG ze8SUi`Hyt~5|(MI5h5YOAs`QzYtwP7C5(IxbcG1?>pUM#Z@#$=Ey$P!kBWT*T(Q$H zt!{1(PJ&p3vp^yA`?~N1z-#STyg;Ps5AsG?PeEzvy$*90;RUCX(i-1u89o|!|CZyp z_4D%@NK;CF9tQ%L0B+QR*R#(WvYsR7YP)2{@sULfullcEUj6lBxH);iK}$Cy#o8TO z3xEjCpt2{gAkRgM`_)cdY*PYWz~@GHgI_ySzZ(RSEn8iD=n;kSVgTDOb7)3;0k1uf z6h^PR6F&p50R=Mo!CC!~^G=cDpa_-t{(l2Ng0@NUPj#d- zK$rqk?1K@euY)9pd;}$aK0ub~> zzBdS__|6muw3Uo4%~VRR1(D_M`y~6LO@i#Egqh7OefxGJ4C=%<*jz#X7yTI5OAd1;dKXAzbZ4#<0&Q0 zz5!6u{|g99mlMx5=A(mAdmPnsNw+7o_ih3vas1Zr-5q>+-vc=xCze{q-yh6~q*fmS3l@#rZ1#zLI;+C%YTudw zw?@@^0zNXo3H(Y9)~EJcI`6lavaa4lA)A1^b~U3wDKUb0r^H`Eo$F8oNWQ$kbv*Y; z((&fv&t`)_=7p%ueqhgLG{ugOvXFWdtj8wEc->cVJR;o0N#=mwUPufa(n|t4b?Jm( zAj6kT%>N0Q$vao!LZSzn;CA2V{v8Wn*NvXN{%%osMMxdrk@XYjD{`)aE|t~(22YdZ zE&BJ;hBaQTk+kYMr5Yk|)l00l*Hy%#T|ZK81FD$yx~?Z3yH}m#isZ${o9PVIkHIWn zp;PQoRvfZ<4|5&forU6n)S|QL~`wj;W z!Dpbit;Oz@>MGoZc4CZvIWfaiW+j%fNWSGyDZ4K}6Y^*~LDPwaR$$UBO+UG_Yg?S$ z!6bu})>o&CszLr$Na@Ivn$Q^45x~Sdmu&a8tb;eoX?^%f?}dyA4vqsmSUAd2W3jrJ;Q>N{=Q>acn_9i3J9kK6-8L=%~LZzS3a_P}%RN zJ)(Ne-|K*+__Ml(gQc)p9>dByJChTggo8;gnF z@UFsPdGfMsxpJSG+AH*H>-I*BgS6^w9VCWYNC02E&1`2KmC)gonR%4|s79Z5J2Tt{ z=nU?iw1Z7loNETLs9V*(Mgpni0)@9ef{7ZMkJC7UTI6!M%TRT!L2N4b-w6OGv^T;8 zk4Cq7?piLSZm{5_uAPDzTLQ2hnYMa*-eQs<9s(-o@Bn`Q#tpR>QDO?R*is{ji|;q@cF51jrBa(2kf&1X8X8Hk zL!siSb{$)SLHNr`NPpH-aWbm9qZoeQx?B_r$FG3~CeBu?p3zc-Fxg~|=p?KF_qR;UawM;k^o*@6BP|nj(Q0Ih#T0R?KLL}3{J_s*72-vK!ux{Tf zXVWv^&KC$klwnmheDahlHav!y4-W>E5;B~=f@k8P-gA!Tv0-LJeq_E|JH}R&1*U>{ zsFD$vVErQ3y26nb9UH-${)~5!D zr{B}!^W-I%-cFX1pFPSZy7XzKn8f2~_SqZPg`F#hrdNQCX5K-BjcHv&KGQm(z^sCj zYHgA0l0z8tFt^^xbsf&Ts1TyO;8{>f$438?aHaQAA^=*G&)6GCuvh4W8@h%Zue@5( z0o2LKdpt*96KO1C4_TICcL#8x57vr~?P!l@(!8%|3Fb*fex`*nu@D61Lj7j|4@B&m zTv1A+tJkRF?bdma&ypyP2DbJBob7Ch8F6GPUPENSNk#6!haAw#e$vhJ<2aq~oLu=T=r-qxr*E(u7zM!a znhjNLAQ8gmS#w2uudv9x;x$08Feq7~Hn(Mgvz= z+{&C50`Yrgqgm$yTy{xgaJoGuYt-S^=gECu+2~E}24tr)gMlMX$vbMMN=3y@8VS#{CC5g3aI&wi zF!2(~T_C87(Nx)pZV`BEwoONLi&r|b=57w+Voaae{l9Q8^0TCm@ru9Im;yQn&6SE7 z$~r#y0C4b*j`p_unnM{-R`P(6IID8waXj;JJ_p}+0d;Mws7^rqK9&WT5mLuAE1f)V zQWI>A=BXS8axq22h2G!jS`@67Q-5CDdI2S{N~=M(No#3 zczN2U&Bgj!W*CxtvcbCeY-xRY4SQTob0=!1SZt=;W&|g*3fiKxyyh@dZF@-F_E9o6CkBN zV~(%xdZHo~{qZW3+BiXhTn#FhRr9v^2Z)~IQE}ptp4doV_tdxmU?ie?$=-?V450E2 
zGTn1!zdpkOcxU9-&Zo0}%*`aOT#~FVkMj}2ldF&~jevQJ#DBNdIlnW`fa)|36N`#V zo>c-J*VLcNf}%Kag?mv;l=~N%VK`VBUD*1b*BHLY)FW{LAZ@~suL!nyyom7~A|ie2 z-2D@;^Hdi>)rbV74w`{KX52PHp1BrXXRI+IxxQgy*_9Wx{+xO)VhVV*z1fPX3CTll zk4N3Eog#bqOk1s{!=w@|74lj_fzAc;EV-kwF6{yb9uhifrFIJ2QpB z9+4p*iVEn`QknbcUvocoH6RLmr1f{3nrq|Z*!z0Ue_uYs0vlr*gBT}gS-K-|W(;ma zaW0R(;x#O{k_2rW*0k#h*SMht^RzA)eGYK}Is=77!l;8$P@6yrGN$0Ci-nNbOogZE z@EA-<4k}UVV;42}g~z$Ix?;GKz1{K4sVO*H^%bbqTGIS4v_8@I8t&hrq4HVdIdGGaZ)E~x_l1AoN1}%S|+;w zQlc7Opf;lOc=H>Hr&k=mzLnUdU5<24n@o^%iWh3n^phEL%N@XDD{Uh5202PqHEsub zDAybRG%>g+VD`Y;Ys^K|YTG?KbFoc8)S!OH7_QZ8C=o}sJ2Z-HS9?}P@R2d7UK$$l z*HTzLvoa?<2}uY#I&LKKD(hKay_;$3lT@o|R;B#j3<#=yP8M7j^$b$1O zL9Q=(<}fTOU<7v}LOMc`WR8R5!aP&?o0C1x5hTZC-`#N5Q#&H1kS+jBSPSHaOIKab z2~icc+{eZ`BdI4H849c@K+%pFhA0sK@y_)$$92G!i293Oq2op|Cz)<9elbr=Ths$n zKG6$Y;IRsNTn0U39|pLIg}u5-QZIx{f2lPUpIwzoT7%iTS0qyLV$PJske-q`Mk?62?0eslO&2;vaPSS>>II4e>1jpvoqd&FSiAZyG6++ zIQ%+il0m?{lqA^Ik>LN|xWZMEuYnshc=*qs{u}dg69U?_ns@cxAJCky{NR5|R;w%r zKIgx%oPRqo^+MCNmy*MOrgfYL!TBCnF2635L3_b`+an;Tc6p9of7sf8g>6tTA!h$e z;cFnpkstB_i`@ftl#kJpy7P2NqFV}$awkCc)-G@no$mAv^l=@^G$>)V{44xu2Fkg3m;{D<0Wx8c zY-;kWDR5^@se9YN?Wt(&u& z_ifz!axGw~4rgdQMq={q+x9H;(~UmBSpu`v)8j;|*ZA z4ZB-fC-)<$JrwQWr}7EXDjW$su*0wX$Ke0}!T%TO|EALXrY2{}omi9b4g{U($+XoC Ku9T_%74cuDm;Rps literal 0 HcmV?d00001 diff --git a/docs/source/assets/kernel/key.png b/docs/source/assets/kernel/key.png new file mode 100644 index 0000000000000000000000000000000000000000..2059b608caeaa7991113bd0ca05654e1a53d979d GIT binary patch literal 111314 zcmeFZbyQT}+Xo5=QqlrSgCIx^N;lFWA)P}BNO!}~AT3BKrGO05J#>eX(hOY^LwCbH z_|4xN_wT#beP^w44(FU*&))lq&*#~Ot0>9fV3A=VAtB+&$x5msA))ReA>EtAxDVV> zPemU`Lc%Jsl#o!7laQcMak4kJv@t_Mk_}HxMpswUA@bk#1~uZ-&<5KGMWR)jMHko;Sd7I~!z?u(_|8P{A+{Uoo7XB6oq>7UH|P|B^|$A@ zw^xS){XX_D+|74{Pcu02keGu&iMj1PNN;4F-Yg=E6i(K7kU)@H{4tnXa3>GZ;Hpa6 z+8APHs<#Gn?)Tolny(~_Jl?&%)p|`3QI3WrX3WHhRe|=HiCU(cB|tst9#T186*+rf zT+F^y!s8c}VhPkSjt8iC(T+=erX0&6af|v1W5_frXFfVFkRCo69q1vZ&vs;LVw#o@ zHo@#!7u@Un{{3NA)GC^JUhKnPWoKI*yG!)9Z01>XPct(M7s9Keuv@j#*c8`5H=?j* zKLzPJpBG`60!+`!TO8CmhlJ}>lO7iu$t@NBlq13JzV7CiqI$&rBhbd~iXQvCsWqZ@ zqeA>CzueZKTE~w?DW!6zX^=_0*~i5QeYqFx7V+oh-x>}onL>o6I@CG_6~d!FuIM`+ z70L}LMPg(eEHZDs@T|###L>4{Fqbr&Y(Iw27ismW_4UhzKTt1B7u9H%ZTlGUk+L99 zk?G(#^&njJOu^abVQtQaF_zzJFaNfTbn-7LsX;&6x2_Uo5Ad?v(;e5t^ z5b(HEeADUvP7C>201bY`I0gFZJ!Dzr$71(M0>2s&Ga*SCaZcj)h@nkl;|8`bFug@# zY}Ii<)<>RbwcSHH@`vo9JP0^7dKgF}BpI^){(cVnV~M2q!X2SPw4zL?SYb<_QJEgc zi#ff0R(YQ}FyhOLJi`1?NPv=<#TNw?QWrc#$o((8KN%kdJma`G9-wGMo`U1NV8ln5 zfzH^j<3N>$XCJJv$b5!BgOM3h(gItgIztnRLJE%JAn@(LGJc{@|1ex89{+JU#WP9y z_sluCmM;a~r^@0T5T3tlHC}&2(uK7ggV`ZzjGDsQ=hWx*>hUT~F2P7lZTLp#1LG!> zZ`!tn4C6Sx?Oi4x%CO7OCJDD7Hup}1PI#)RFR8-XpF8IAk?Pa1<0ePw1?t64M;CNg zE!8>r)))&>l?1>#ycZ<)eSC16keZ47F#PWO-J6fS6{8_K!Z<bn(Iv(sWY^^IEul}2k9`k{_x?fsEdL=kR`QEvI;c$!#pg=Zo!4`!-_(indATz_NzBR? 
z@<7h&xxWiuVhDTH`jf zdR{qGg{m8(D^s;&-9AM!wK%0vd1_@iMm0V(hWRrNu{FvMJycEpwpo;#U$1ytx$~LI zU{FoD5T>{rHRqf}7TGRukyt*ZBXKcx5pdyjEq|?a{ra9-5L3|D{p0(w59C5^+UeVE zLv2GPq`jm$qm#(S$Yyz;l2t^jMY%t6e-xXN@~I=GklXUbV-5}8UmsULj(-y2%Kqf} z(d?tf$JrO%A1nGt`rK0uVg77stg5U%Y{fclNdrkHy|YOMBV{87P=#DA#b#5fvL)?b zx?{S>x|~H4g<5*NI?d(J%fn%ClM&;?vGE@~vu>H&_XIWSY3j$^*xc;5*tXD4ex1N5 z@^Ei(StveHf(3O2A}CJz=3KJ``(1Q5M^}$J;hj3G`fEb#CM)p^<-NafQ(JLOODqgs zI_oZKiXDVEE{6KyY~9;=(}p{mJC-}#JNGzWS-dmfN*!7=UT8hdJ6hj=cAf2LZudIMbU^MG#ZtpT*>T)C*IAUqY)x-XQPVqnfBP@?(9C?p1jCQ> zrhfNs-`winenEozGawfQOtvhvum!k#2nZpUeByn$LRiU~C!9h^L7-1^Mc+XuNizI` zDC04g^JZUIMU;h{Id@2)3V40|r;Kr~v5s-`KvzF*s)5L*WsSkv@LmjtS4ZHxPw!6S zMB-BEs3kv2{m7omvK(^GF36TK@iC1s^)j(FN$v!Pk3O*VMoa@&<4(M_14*T?26firb3) z1?G_Ipt3^COd_msAj_;oCVL_~_em&aDN!~4K(&uJIouF>KUXs+CMP79QuZ>5>H|3Q z<#*Zde6+6=_l=YFvKtr3(5+v2ztp&$fvrKZO=3uI_t>8x1BN%E)1sX$G6Gg zY%{*5v8NezSm_a25z3w?*S%NsMbQU6is(#PXvjS_k`u{W5BZ%w zrWi)Pd)5>xC6_%?*Ho6QJ(3_FQ`ehY!MTFiaQ2I$pI!vp6rV5M^fUwgzDrWy8qD5~ ziV+;!Ul?js-hcQuwW|;CU?a@5<;)cok(hyN3?x)!G9)zM3K{r{Ad~-nEse~Cbno|l z6eOfDOC;1k-%$d7?>_H=?_HTce(%MHBB29+5dvS2&nSPrjk@>w-e1@ECV^*2FV!UE z?)Mu4!1dj25FO3$S6r-x>9iG9Xe8{N z%xHMnIoLVqM6hURXoQ^JmrrGty5y&cV6y~ZZ?t}eoKbaxH?{rRJx zW*(ORv}EV}=dgeQg6^Jxp0jg+{;nG+Ds*>OK*iF-%tlAj(iV^z(1r*YQ^U6-G@pj@lWx~TH8ag@259(w$xT+g8jWcz{T=TDZ&1xh&`RSRzyh9|ssbNwb zh_fSPZ0B}O`0_?*^*lJL`{zx`ph(fIdx^!a1tepBZtxIXGO-JTa{Ekp7g_99iF)IF zAl#+EMZxz+LZv}M#t=jLkIOvz067egL{`$M!2j{NznDA{a%{wZFZjE<04yT0mI8-G zBeMUgZr~OCy}Wz>Tgx2Cs0~=8xVbO?vqyjT6FFAk|C$Bl`hPi}xl6DNT$MS?Q6EA9ab=Q9FO1NmU_N%ViT&ri;P=G*JtopC*T*VB2w zW^8n8Q%O7dDla!(?l`lQG&s#KGl*ljh#TEr zpSpKbc^7(L?RQrg!R8_L-oIE3KT`hDV$WTR`T~>p|7h`%Dau-!b4;p;Z(aTQYO3?Y zBwed#ry(GpR^JT;L#O%k0rwflZpt-!eALsriN=BJBdDlrK}nMrU#D9X=8VU3^vsj)-z5B4f7riyjH^(J* zi5i+8jUojUZUDK|2%fKen1x(!ui*-Al@6dRd*`9jJM01j+lg0T|1msvt!V~Z%kUn* zfE0k;30qCW{7H!xAEDsqy#N#?b24uApNg_8rDZkMeuuqgAy5?sn}pTj1EVr~17!5I zOs(+AWcD5zV|Q4Kjz#*f>S??BYR% z@3q|9Ip_rq8jQ5MP8h6>8~R;`9uj#j6&POU3jz<-h#*JV)U|^?tDHoZN-~zBr!&rW z;S5Q#?e7{xQI_)+7DCDG8~{NS)ot0j}j8$RQA-fO5TdOGKAzi}0s|0MsAtgHB< zy7fMJrEjy!%Hi6e@-FX1Q3R}`t!iAiz-(3#=kPYe3-N7K30~7hW}S2D4G-`q#V89w zLbZ}8_NEl_?L=ce*l7=_dR<`>7liLpdlQ?mRjUB~IokOL2_}NeQVqq3Ar=jRKJ{ z;Q6wIRuObYPkr$-jO;4RVgJNA^HN`eLSQijQR!oHE1>Cnc0TAghrd(SO{D;6#>mi0 zJpPsUqa~y+WQcgg&@SY(o|PR`$q4ywBS9hMmZxOk+|R8unZM;hz&GUEm1J6K=y?eU zA~i3qu;6l6+Ou}ExW3#;t&z>?1iaK)b{_<;QEb|ew&$7=?Nt7?9ai=ff)yOZj*FG#A_`#!!Xu%Oqks`*#F}c-@O38d)lQr1r;jhAZ?~ zy3G*TH*v70jKjuiC(Eg$_9k$CNz(;_)nMK_*DM8JQ?GHo+a#23Gt}2y5>Jq@GSIgv z*?c7B3Wem`d^FEgUaKMR)6yq>aMYbF;`#CI zH};i(Za(j~$7*L{uPyQU#S$_hdI=vicRce%$G%R0zcB5JVznW5Q%@E>|Djyn7bJqE zykJGy{3>3%#=*eq7v8aP*twuE2MV%`1NbD zPdIg1rUafa*CEj^h0k6q|1HFIU{b*_mO)^=zp{usH!Ocf?gafe*{V zv-xgnKcDtYGb1TJ-?LI--FuAJRFPh;`668dC3R`bEXD?nKJHVN(9k;XLE+Ptz)cAy zSxy^<@_gR1ZSF_KmXZF+65o*lL*ob-8e$MlU(lb1Mliq}B?ElJ!1s_Jr?VVnMfh-3 zqDw%O8m#noNFif8I>*?E7B~DMg#9_WP%U6^%Qp|Sxaf!OzgKZ-K}M%ngDNS8rC8>{ zKc|J7-=D4;Q?Oxmlb= zkzxDtNae0$gNrF;iH&9h5l!1ClTn@V*q{au(#BA`O_*u#31Bq(&WAX(y$vqO>hk6T zgi8ceMlV(4HcknzC7p;B8x?(_u?SveaEWxDF3tZy^UOIsK~6j+vV)^Ma+U#XBv%ueO`slUkm- zeo{-Hy>dv7=^re{@UN!4s2DkPeYiJwa{{x_SI6r~^9n)3CA282IEE@UlmGfciaZ!$ zU6&kOXAaBf3`ng$X(D)p0Dt{biNMUq106um+8`MAoz09H9(1I;GquhVF16cuerN4z zqNoiNRJcG?6>%53dLSE9{Smqv&0uz*Oam%@Thc_|BYZ@YPHHPgBtrHcRaB5j{)gC>fMfY(tHGauk*Ovp zZ_tcS-UJ_vOlEljWjNz{IXrG-*eF$~ru=Lgd!uGKMgcT!?1Jz#@h*n%5mhw<4zJmH za^6TE{0dGwWf$@eQf6=NTw(2oHoNP)>)qc#0^UTg4=3DXuiq*!oJk!!JkuTWc@s#4 z7x`eZ<2(C;3Tn9>3Yc{VwXBKB-mMj@pV-3SZk-1a@AZVbO;Z)WY>9-2cnuHml#b;8 zI026`7ZUGPzE#*OIe(JM&muN;X;8tTf4BKn&`>_S32ufj{$p?$Fwf)`XHS9PVwC4L 
ze)UT7;A1-EtL$2s?$_=cvypTD*crM=)_AH+_c_mj)4txTi!sjG@?Q2-%H-{a)5cxb z8#HeB^GVZKSBTljQJPbDI~&A9cZFz- zawYibxI;wPV8ik(iCs%;T~3fW5m-B}V9nPg-Jav1+4Hr`-K#Wx%TIuLJ!~&=-tca1 zXDjO<{1`3Y{5`z6=cUs5fdp@4gW1}Zia1mV9V*ngp=gg4_Em4p;&S>x0+OCN&wja0 zjCRxrow>c)y{+S((63l8x8{uv31D1b(X}r6b!CEKiKv750FfK0vI9GHr`Pbt$xE>r zS#;27wFFpgW(zy@yKV(V)w-WZXVtD%Mp+Gw*Kq7Tf(|1UbHczazoW9c=KU^m!FeL~ z*;D7AUz@Y>b8RjG?`iTpmTTje1TKGv=RW6Q{sq+g>}jga($x{_0JhJD>oxdSga2AJ z5E!`l%{6!`S@w9oKuO|cI0`OboFv=v4I3B%z z4PmJGtX6Dpd592|$3j2{cJvP*EK4jP|D^n?pdgg5ats9-;b=kXHstB3S<*~LdfKCk z$Ff=ZE)wFtl>G(KpGNw5Kwv*_V9N-b&{qh@O+vA|cP0qTSNn<8VI2QT=PG zy6pgHsqXi#@yD|zk*jE9GgACaxbf@;;1~5@Gk*3Wek%exEjF_?E8mf9q^ptGpRR=WNM%l(sJ@^P4}^UmYO!2&Y^3kcp#$AK#bvy zR#s)z$Z4Je;LbF2;W<*&J%xJS+C17WD0DF>=;~?m$je$v_+UhAsOB+G#L-wB!nRc7 zT3E9jBJfb`9QTGH2jLE#hlXx-McLIXhH^z``8By4k;ERn$Rkc;u+mL6t4Bnm($4ZO zSqmi5Fle35NB1zW6H!t_&K50iD(UPSu*wkf+IbSx>Py~%ZJSM>y<;TJ#{yKSD>W!_ z2iUQ4tg)cqu;{6}-9q@(d0u{wfoiKae{{k5tH zTdDrSYgJL)s9s#Q5uXIT&!MtiOq(6Z6cyd(e`4I4VP9qM?K+wD-Uz=rO@0N@v0XW$ z#IIL~50?|#z94?m$Or4GD4zDobn1Eb)p)ex6+0vj&oVf}j!wF?!l+;t zAvHhXInGx9B0$0N{7d4ua3HMIqn)RThxlgk6ON4lqq+_ZS|lIZhuLk^Wwx`r`<`sUahCM1P@r>WxFc!Hvt+GLHH?xkOZMu~n5vXC%l zQdxJ7yf9YW$a@eOngDj6bq&{CiLx#0E>|#n>=k$JIZp<0C!C1kTY5K2vL6*<(+QbyC9F2EHHNqF#ml6RJI@hg#CSm^K-4Bt zX)97yQjUl6k&=w!DqL#4@v*iwhVNAw-5H+V7;VT%spjUWH`k61SzBH)$?-y@AFnP! zLtYGXM%H`~q(o|jMZ117><`n)YO=Vy!uQt#lg{+F_uyBoDMrKLay)nHWn0tKF-tre zY|g3cB+)BmK&^ipN&@8I==XAzg#kY5;7$Zfu`HwcMDTgLlWs!GDXueMQlW=*sd}zs z8tG574I$^h(Cx%UCr_<%NZeK61-PtDXGDkGV4gke=toH*-jpQhD=exl)j$joRNYf9OFZl2#0B#dx zGx^!$88COA7_DXx#&^@RtMW5ui z?dB#6o%8diBhn!5iy1^awcsDI2 z0f?^jTQgw#T5Rh!auu%~2vG3XGyFq+?Ew`x0D8A{v*vdLy?lp(`3F>*#}90`Poy-f zfA`D}g>VmibC4|9?>E+$4nQ(HqBqN;pF9@<&gbV7LIdXRxxb_#H{~55!c6S^7%9A(~Q#wXps~?8QlWvzu6-J8h`%@WY&he z&HpzS^v|WyB!KOPDyAL%?FRf21^%rbkbwc<8Z8q4Q{+D?YJckQ&ug=k^Di*^-z-xA zE(XnlD-w3e|I{{6(IXz9dSfC+;lF17zjVVz3>f-7Z4Usa`Y%s@cd(@tKm%}6`8NI+ z((Cgbh)UVg#_~UwMnVk*Bx^s}A&2+BjE)jttYs641WfzyG5rI(bJzgYw?9?#`d>!3 z?vI25MlapN`lrahD+(|MVkUAP$KgLC@4saJ*E?e9z}yJX4}XSS z|1mFjJ^H_?|G%mKZ;RshHv0ehu2-UI1r&y<#6pJmj@*@E?wWP_xev1A179_5u?6<-IQ=4`3qNCtHBISyuGF+-*AuyGOB=?tdZX zoAU0$XrypT1O+@MkLpdsL4(3O=aD__xt2aM*-I6APyW-}f9XanjGrPb{%y~G>nkRD zz~_D!@&W&mFbaSaQO`(|+a{Rzxua1vK8Szvw@OptM5!FK(lkoJ29%@`3ZTywX?)^C zTR1R(icvhZ9lirt_CP2Q0xDkT)xM*SsK=Z)t^t6(FxtZWDMEh94CE<^83kgDmidPI z^S(?Bc|!}RWiI@%KW!Q-eam(7CqShp5G>rNNQWNYX~z2spJPr9Q5#yvF-^k6%rHjx z?`u5vc!UCU;WIkQjGwo^j0#2+AMy^~6L8BLz&hly1_;dg5$rMRrE@$$xn@x5CRg{v z#3u*rN>v~DA;5u? 
zz+(SzHcnd1WcGPC#c~rs@Iu=R+!iwh(WyK>-{BqWNno#yGgYl_JfS`RX}4+H*iS`|L09euNfRT1i{@JzTyP-!;McANv6B zT0fg#P&^gq2@prl^H;09zcH;)hhEl-D8ZeY~&tM*KbL2LB&as2496 zLP)_5w`bj=!+BoU@g~jkXwye#()*flILN3Ga;~cHcQKNAhle0|W*pj&6jT!r=L&4A z`;LiK4WDdo9hvQe{LUl2Pv91_sdGoU0v2e~fA5x-?*twI4>$uj<{PVhA4j*9p?RNA9(K9bbp)<;mgAd{*gvX?B$e2I5@Z;EbAHiwsZmrDmsJTQ~4Q$avMuFeU434kM+UBpxAtzgi?aNGUJ395-xsuw|Ph7s- zsOr@Bx0?V-@~)hI$h=zl{9)Vr>ZaHBAYZF7pO%`r-VLpxhL$MVo3Cj-EAqlhFz&a* zDt3(NG9pp{DK5oFAyR0;o9+v)48jxP+s}UbBIrssr4om9$`DcbQ<-#XI-*$*8C2cv z7MNG=Pr6l2T_Q-lclCaG%lB7d#Q@gpTs=`%feH3ofQd_~7r{jb;un>k6#|ru77Z}j`6Ya_-*SQ@A%^6Y{wOCPYUKf)a@HlQ%#TvBC7OpGL+gWjAg(O!ht=g_*JAY^5qJG!1868{VA zKt%G}nYNPsJbj{rQxWk)f_?$oEZ1gO@g|7cwl@tGBb)QUz*PpE#>`kMND+K~s6 z8j4y4WQYjx1BoT^+V6)kXvmc|-X;V}yM1pSp754pIqe`)PJonZQhX7H@_33G#Q8HW z%AE^eZdHtI0}$lc>-a_t4d!TGTChEjiDu1D+@37JGkg$DC1GFELMx`i4#3{q#PM5~nuatI{5R_| zq6qIRGXjJ$^>9(DKXh@n)lF1cp6;^UkryR{jJ{_7$sPcz3)k+oVUINL_X_ieBB6dF zSg-OOXSFMirbfoJ))d0DB~iR*l8#ynaIj3)Bi}Pcn5HaEAizd@h^zO&vFUPlSE;yW z35(pe8q}Ki7Bz6Hk<8IVbcb1_>Km2s(a_Pdo6k0oDq$7m7xMb*LTv8ap))76-BLnC zV~rcFA||Bz~yqr6FyQYp0#c-h`K_dw5-hw65Fz&RL2tvf0qy9!QC!+;96wB zxc^!rQbLF^-9@$jCHD1LneTWe@4h%i8j#eIFL@{3n%7n=P>{uq2y;ru@*vbSrR7r6>xQN|$r}&f(InGY zC45|39$O~JgAn}KZKev^j&I;dgyovnFWX1NZa|(3s=a~jrpK*R%VvHaCThS1!TRxi zGyCfKi*Xp1J;IAgLEp05^~r+VJAOdzliAIcD;m~tIa=ZCM%cVxUr&w)RT? z1Tsq&Ll?^Yq3EY~aC~T2I8_p%XW_{EpcZAr>4>y+i=sT*H|^^$OLlIo#PXgG_l1WI zKiGgxC9%C2!oH+K-c2!$dB)nERpQZ;X;=l@>os8FHHbAB+OzOAi+=nrJ9xoOzXT=2 zOXNbMBL0qA2=|h-;%hG@A+}`nJA3@#PTE0b5!!&bnehQfEkyMZCHZLTvy3gnz%d{n zC=S_2^s# zJO!3Wc47PWN<@G~7&?0W*rlQ+X<4NA^L#bwK7~a$AhGTKw8T}}ewn{TSYs$h4{Yu= z05A4^OE%i0wpSKD9xXSr%`92ICPgmUUg~^;ictJKL9XsIV&irdZSRxKP^g3^g!$U- z%tYtHON{vV{^&$S%@o!KJ8fHri_u66h3a@QVYcIo!6C>Uza ze&aN0gO*z^Em;bEOFq~;!~zI@5G!ds6c9U^fbn(0wzWr)tFCQfpZ$>MF~a(n7x7-i z@R-AcYZ1q6h0hb0yX*0^7J$J$MMt#IU`(E;at}*7m%@48c}f0&S39z<$mZ1DTx?>m zrGEHHx)kYs+Ib$QLUa{cB=S8BF9osv)g4_h;RVV2qoI)qbHawgM_;+PqTWP9t> zI|Bf>0{vc<6WQ^TGC8^w&3Cq|OVuzNHIGgDa=4KEL)+CU;=NIGSk|{kof2l;c;Kz> zT(*}y2_I}?MV$&}MxU^s4e_{Xwp3gm6!1-NxaRROa80{xK}1Pz3zn~8ac-|NO#xS* z9^=R>sGPU26i~gZ&XLVW8(u5~`9#Rd67t8LY&iu*R6+vKTFN7u(g`lcAm8r=NUj4H zn2S{c3BuGTW0Pg|io8jl?H7pn2WdlSSIDqp%^S2!}-~5Kwg>wpQuT}IfiNd z((>=XXGwJh$em=O$G2-R_C8w-TMI2fV23yAYz9o(X!I+gHh75OVVYdDJxm)uzgiQ` z^u~vS4C-pp>#N3oB9ntJy=rttH8E5Lfax`4k$41G=b@Gs1WX|wEjvzu&FT@i3qY~S zclIi2e%LlAhy=q~sH>mTE)MWiCs<%cEwU8$!^+jY-UO>7Qq|S z&f~*H{iP9Hy`)iEK#n|kLNTUgHHJRM>+4$`t;?O;i%^lxv=zP!qqLQWwQ+ujAatKk zjNi`ZC4ZDma-gUttHnPtC6L~Z0|EJ~@Z*J&x6|SNT2GleznV|G^`nYTLtCO1=vgJ(r-R&YaYIs7)3yeh0*y5S)r0tZUe6rc#EqgbCR0l^)tH@h zjYqHu>z*am0{JGFKdpGyQYl|pu9Z!jB^#87y{^yz7yTr}mIp-1nlN8u?U`o5vao@z z8do8_r;F(!wK&Z`5$!+0;te-L6Y5pE{PdypRb_dk_xF+>r9+A?ziOXfgFOC){?Kw} zGO+F8+jf%Y+tMv+*(wqmqn{(wn>=)%BGTWH3!}K&(8PeBz5gAnFH3xetAna%2~MEO znHfROK%&&@6l1I5P&Kgggke*T+gKiMcFW~FZm+$pyhEwp^o-mE3v~7d=@lp$BCNd5 zZt98^RKe9nR-%&Kj(*$LhP^$C+c)L|O{c*j^rM3831uQ_?wdWRLRFp1ua(ks3GfPh zm%uO;1$krov?pbP=KVO$X z@xz)(68SOTd!3Q`Zm01#ljAF~*NazQV?2YD5IKU?@cb1HTs`n`%Ww&ROSlW%aot2Jy z$jWXT*ID1gE4I&661wq=4Uh+mPnhk@mNbQ~Jb*+OOVv+a-BWMY7#>u2=n3XO(+5Pj z&G5LVHzongSOBgY0GkD;OYKkGK^|J_@o7K|LxvGdUR*8Jm?DnELfCou4NY|FPA*@J zM(lW4wc1&N3~|OD49uzIClth~{ASlwxS{2t3KFEvgkEtIHA-asW)uNL0suiYQ7iLw z-x>yQgCP5nfG%IW%8nKJBn9OUZuhiZ8_di!7>OyK#v23#^Cn1=fz8EnT*u<;$b0R5 z#R-NNy3h;_LdxE$QN*@+NdAjBwvKPHQ(F3tQiRH7M8(s#tP7TNzqHdv>JzU03r_vb z0f{M_ne{3;hjgPPhc--|9c3K#aSp!uF@4^894ClNEo3!dkLkLU_nLDYn9#-xb|49Y z!XB=D#7;MmpX~*UuOSgs{E+Pi`EFWJM;b;QVv5@Enn~&aDS#FqgIC@*m95;!{-O>! 
zxz-I$CaZ5WPt;*F`YeWOfY zvV>&&tbT1ZSbq{M{%)D@KDeK6kak|zUy-_)SmeZ%H1ek`MKl#wnn^RQNczO(cfbwF zj3oV%;-I8Tfe&AtkAZVl)M(+ zrbkm$If&&cX?&HLiyr8GbtWR)PMVvGZ=vsxv&Uk#NtcY85rLXZV=7qOUN*BS zZ-IHz;eK`0IXOjaGm+EE6Z<@9A(ri*DGT;rm9Mak0_aCxoZ^#W;btI7n^%$?gllol zJ8f8M^zw!K&W!$5|JCNFDQH=Rc=T-Sa=cM4&u3SwOKxn_<@ZSA`C92B4bvP`S|XHEf?W{gm0i_235kz8neC`$f9;+{a)mq>)UhS|-;?bN@5FjU3L>H{cyB7yle^>DYQZwR!U8&Eof5Epom*OG=WUtqp3~G9dSN*-1mP>$ z7P;Ia>QjR4)7R&#StV?+`~uQ#C-;c;qE-IbUs0q1cGDKiQsFah3N6)s+Zbq<#|Fv@zTs%sT&8G;1|teH1IgTtNA?Iwy3*O4j9t&sKY9S5_@&eQcU@Pna9;GA)r)%Hh72?wB$jNYL^bo*1HjHa z8=&|sC#iu%q5=2EyG&Fe^FD|D6&zq2NAQBf&8DJT@!M`}4$G-KT7Bn$`u0p_EevC2 zfLm}6Te$|G-6 z)}A|Q!h1x6b=$D{xeVqRL))scG*X5N>n6sGyAwX;%jJ~w-u?h}Iyi{u6agVu2w-nWOXD(6*E-qPnyo?=*kDbx z2q4Q%kJoZIN4z;d6T*(W=H1Lck8iWLOpY@E;(4EOD*ojVC;Yys8DqyC5Tpp|J#lYX zrW_a~R$CPo+$HxIIJI)UJI3P{L-`w$e&mZXTRAA5iN#-tg*@N>!!w}kymnFe~vdi z;FqAx&-{KV<%L;f=w zG>zeT3VH;xW=VYYcy{&P0A+Pu2?Ij&GF`cpSs zTq#c9d@%GyQ{Sy=sd-y4=YbGr9B`_Jn(USEDG6-k9M!FQgg3v+->MAMa<~w9oUHY( z*Uk*)RnVmrnVXkvnx;&yc^F=Kh{hL_3Iu)thmj|Gw`Y$B`T70D_>reiRLLI#?iee* z7@p8ANk~u&AO89&DcF~IpD0G)c-R{_jP?m4rwbh1Q1H$<{!m7GnwD8Yw=GuPv=Ft&^|PRgN#+=7F~YDItulAL z{7j9C_zghG{2)R{wGz84N@mn4c~_JT)8W=O90L#kksc7~c2%$c@IXdp69Dp>)@s)> z{Y2a~uphmB-lWJo%jKSN593Ju4^j>+pH_;WM}X`qlPTzKXZ75$*h<|Y1<0Wo%@Cs` ze^(Wzbe~3r=#Dj1RY?KzY@#Rveg{#~#boiczt_nfNhbL@2B(90zn$c-J2XG-fruuP zcHy|51fTkEgUx|jnLpR}k#mHT)4Y#EYw};2CMg(y6i}2@q*eU8n!l*IKbW^lIUqqr zQN`nbN#SUb1TsicF%UU_<^%n9mjA_q1@HibXB8%v{l8d{e_sM!!0;e?na)KEoK8j5 z^bDXHk!{kGvT85bHS9~n=Dap*fpjvPLY?=Xe{`^g1K?oE@0$(ZwZuV(N(fjx4ieO2 z;4ECneD!d1*>{eQTkl#3WGLcTEBanoUjOEG)Rg$YX770bAoLx3s2=j052^ALpJucH z!acCnYQi%$tad=1YkoD(RZP`^M)z1BLg;OCq%YU#aokX#Y(%zApf>R|g2A zKtT>?egGhq@)o~}(akvz3iJAy$`YVFkp-A%3=(FYd@f}0Pf`**iUlLTyis&SyW|*p zs$JsA=A?)zM$zro?|p9cu8S|h$=$Qzj`R9tT@1%R`#8aBrSRAP1z?fk2;TG23Od4-pcyx96ofMaoT4;z`O#`jOTyREdLx#H*ItB| z(8E7NDE8Oq*d;;A0?=>3@7N(a0;Fmkz*;ty7A0Guc793cAG^}vP ziSF>sx~QEd!5TUhKXF;oSQ0!#?TQAH5gb_@4+P^-&l(l!eTVNRSA`nI)dx(oFb0Jf zKSTARTVRxO{$AJGsie|hRA$Qm!`64lQ~Cb?b2@Z{9E2i!zU|81krI&>*<|l6dt^p5kxeCJ zXYXB*%o?cC?SuXA75^?E(mg@YgkDmZK}0%Y_sD;^7k zSD6rAy7Pix`IygwN$x2$Ztjf5b>Qj$<)J&JaKFu3BvD8)c5?ZNRX^K|HLXXqZt1Aw zPYg=I2+Asd#{bU;kU5uZigNbwTNG`$a{OORFmk7>8L%t(9k0*)hx0~WU})hc@fkR)!?b_dY;0~?T~r={|#<!_ATbG(j}v^f9c9k_ z8lS6n`*K~70&4tDQbeN9p<6V|b?Ol?QA~rnvMa|hsw!X0bG2#aPNbOHc$17x_q*$o zMI{si2+RDfJz*hA$3^2exP$t$!NRAaZNCZnKJ}*#^@^B{8?Y3a23*0Cih$*E41^M+ zGf$;|sX-f^HuDsGz0dhh|NYJ2*{=E>HY3F9Z4N<{74y-bw7wNS31VVi8_9JqL9;! 
ze>~hz$o&wp=r6?H(8Em+gkFYfZ||KrDukg&>QSkAkFPu>PXAvry- z`XXZ#exQ@D%FAB|AfgmBI1ee<{TTQ^z-t;|(ei>1d6#D{Pd>TSZb!eHyLOr5S?NAITq3 ziB{rUgN8kc;fN2|>2D z-I~ZS+#|xZ?X_gx@voI80NIQD)V;V`p%JSYkToTo%Sa)hmO{FUHMs3>ZgyzD%Ved{ zz>XmsB#S`X379|M6Kav#=QmjR#ljV2#{n7n9;NNjvat;)!Ob+)=L{?*9a4Lxw!l<^ zMdwfl9NR-H?KXIogApK05}}FJG6qEaAm?G#R~`2ssXq+Z0aBV!j=9K6BjoX~?VkC{ zLZeFNed7UkqHW}y}Cj6Ev_)4#z~C3Hr~cVc8a>VU!V@Tt!}b6B<6eZ$hh2RBOcB8DJcIRlI0cUy{0Amkn(z$P{kn8Yip=* zK88}jwM9vHS{G6dz`(1;>uNq|(fmFj$2dxZt;j1iQtivm4&(wz|89R+eMu6ig}V>o zPQT^+c2H%Rlw55j&!CO1Q5h{f+i+Wnr_bdgP}s3?H<@DJza%}$GrjsTx+-g_@ZRBc zj);JTynXYbNf-jk$s1b|cX_&{=w^8GmVbYf)sFU=*#0DpXuYE8=)l^lvi>VPEOTzX zo=OaUud^pmh<$`yI`{IfF#Zs07_!83%GKufrU|EQptr5{06 zT&I{Tf`dJF2-xGF6*QCB-`MYLjF;wfVMYAB7kv83p?w*V-gJOeYXO>-BzxSzXyp)v(+owYK=l>-)CJI4{(i5_m3$bTKaJ_9$T$4~E*ayu}dt*3*gBe$T zyVzsiE%(7_eQf6@Y2 z97ZON4u4w*s(X?N8B!6tBBrH?l%lt-K_V)&NsjHJWv;y78DO%tPRblPD6!f4z;IS} z!BYHxHbtUP8J=J>qEq;*vih}9HxQp)c`cti^&)Azkv)D{GI~y1?k*7Q3?ut)zgF6$wcYhFbp|~cN-8wwFMP&rJD;Jf;D)Fp0oU%qg)*5?{JQ4PDD(G)s2KAG4!b^Q_tE)!Ldu@9@!Qv zn5pq>aTxJJn>A+!?Yehyi0c)S|E_&$eabc=X@n>G8nHRhv`c>T^OQF{8u45noFR$q zTY0bFm3gDqjc~;;f4` zI9u^ZuW8kw%udj>>Af#ysYl;Z*5bli@RpmXJW^J;X27aj12se9C`-Oz789C`HMGFFVH5?g&R)#tCs@V-H zZV7CDJ}CJov0D2|?cXBze|LkR`J4>qHhB|Fuc=w;>q}{ZsaN2xR}k6hj;?;CLvk2e zoPcpPeso(KU#-@XZQDHGU;pLWo!LUOt1INi_D@US*o`Fu)6@9gL}l@jvSw{=YZ2F^ z*WSoP`}7F(NnhV6bu2?Q5;JkBU4&WkH=>0TANaBb9Sg7MbO`e)k9yEDAO%nCuo0gW z)FH%SW(op}S8xhfDVCTgDQ^8(7DZPcuCWOHS78l>_gO|16m#lqjcC90Ar}xsmyOc9 ze^0RvUcOR$rz6`%y8@M%B+OT^{{8LKcKPh7cVzii)vSs|)Tp?ycVQ>ECe1?}{!r)D zCXyH!>OzKV&ACCe!1@Ia_bK8(%X}@$w_4Wn^lL-~Xtb{I>F--~mIq}J zVu&@0wO*1ija6#j?US&e4$3U+WvSi4W9WA4dX#jT(3R|kfVPlJYz^;Va&|&P2Vdhe zE7+{d7D7Ms`I27i`EAHG*}_|sr(i%m&3&0`HihFUqdq*Y&eFXd9?ObP4NvD=V_veQ zy5VOlFJVl`6b()BBu11j&K7BqDD)MS{<1i%`&ij%{?H-oV(GD*OQn9Q#0!eTrGk2P zw1u)SA5#%;_45mdkc-oyyiqZr>k)oqyczSiO?ay`S&(#K#daERYL=>ITi#~8)b0=% z_%@mJU`>%+>&lS893&g1b)QJH_9Zb{WMO9$;(Iky5gqZk0puAz`*W4)$Toj>M|0P7 zXHX#B=Nv8U;Zmh{%q^V`ZOrwye9JNV6t5B_32!d@zT%-CK5z9(zYlr2GO}djhgIT) zvJL1Y+I&oB?$}-O&eoxZc9y^8P1{y87&}#IPdK2xM{e zP~2aNx9)#BeY%5av-w3Wn*j%a%R?I@_TER^YVL8L`N1#2N3)S zvKGme7t<7NBR#us`Ky%acc&alYW@3Z67L4J`NHl+C`Vp!crJeCrA%q!5zrPGJhyi* zekxY&<(SXWf#WyN-xU+q3VIJxg2*9ork{8qLi%u9+qD9FHBWEK1x&zBn;JuWd-a0W z&dMr|(@jFQgkJZ$&BnIq)mM-GT0h>I3chAh?(iHeNZ!v z`Obpm#IyaT*@>cI-urnBF@TU=GeDwif;O^!O@Gg9^pW$^QEGwW&7i?P)9vTCwzz?3 z^mCV7pqH`muX%R0z?vf(n-oxOau}8)31u3iYk7Fn$Qyn!2>sfds}J-C>70v^zk5SQ z^n+Vsz(Zi?-zr@1;#s*H_qgOv2tL>@0?^!YOPXhqxPrb@??&~70+wAkQ zPeCwQy|L%fS7QKzxW%iprY4ALt-D)9c8~(~f$Nn{7fZZuuLU*u;3KURrQZo`1IW;LfrXaT?EF@`s(> zFWp>X6NLDbS$y_k^Hsq}nF zM!InRe-uZCFM4b(1|_;MdJcMjyOh)tnbbHqxWzDO{@SLinX=C>Pb~XeIiY}}@yEP2kf?bsaJtj)Gi zl$>2>=%j(}qiU5|04>H?yPQp~xlJpCE6v^O=GV3(`uUh$05@%B+2(-W1W(mQ_X$B` zXc`=VW9V6_`ih^Zo7vio&li$vUE3x+F^W(**^g#(JpN@A_P?K=m&T+almg4VE%qc* zRmK)4(RVyv5L?KasdOE5xs4>btyf`t#FAw~4Zpnl2We~-7p{zuPUQoffaP16@RM z|3)d^U*_}QYhclp5t*#Ge|Lvt_{NWuEPaMhgF^JdodbeG=1CoB6%izFvx8eS9Sy&Z zKOg?Tpw8d=4B653F=$u6^C+Vi>BN50KzjYETyJ-TfG*GWZ%tHs5;~(cJ<7su3G+Cp zDiAA<5LRu~GfftKCeZskBF7h#X(BKyorK!!4%ZKjyQ}r}5^&{rV2C8H79st=xYfU~ zaBVXgxB@-9m?nu6FZ?5uOolEz)d&6e9W9gA*S84b_z_qO!MXkeBmcbV`@5|!n(<%! 
zit{pTIguRrPyYL_kPkQ^5Rst~+E+#raOb{)(L8E|RRJ{agU`7aC;pZNSHu1|N8a8$ z*+Ps~gg=dZGlc+Sety6th^tm!IsKn67h_XBR_AjTafx?ddc9qNNgMdOVA|L7+uaYrJ8g%N#(Eru`*?_b zJIq@d>a{)XCp|JwU?;8Bj0G!J4clKUR`kIyg263~-}yws?`Uis-XTsLeJ6s>XIl4* z?R@bAE#y}@XQogtUTUGsnWB3kQ#K#EYuUe zoCU2{7UWz=dmTHNH^TnF(whGZmZs#s-)UzaNTFu8kXz#oAav=k@`cg;opqWV&n$n#PiP6kj-%OD3n(vL9FPzQ*2N-rvr^uo}U)Lc454tBHWEAFr(39HVsNQ>pq&+ zJVfc4r#QRPk5-p!{^bG zVNlHf#|ix@9;c(^LM{%H=I1LDhyKYHB6+wyWAkgY%>36Eu}g(9n<;K`wKUWgWIz~Q zjgc9nyZ=_3614Ql`>=b>cV6r0J{dXkd&73O{$M3F%vad;(>r)IpWx!s-8WQiHUdX= zIty+M<|!b2CKe+AV{)1sCWKG#sLgjbxBw2^j{dpjIpcO=dl(E~|KAV8wC17QJ{`rE zkD$sN%&H}OwquJtGrDu@klMpbYvnIPPta@s#D~5>dL`XepZNR*0_1h$0IygHR%t?| zzQmT}F$Q_YYsl(i+};}zTEf+Van&(k#-e{-OSU2LEhi1K9aALoK)hav11yh~V4`k~ z748rRWeo6H)bn!_@O3j)9t=td%vX>sZ%qeXi;$pV6nYLK_x`*d~oQ`ZzE5i+sn^2`onM| zV8ZQ3-8C1QZL$koA+(eo!9ohwZkJg@k)|{E12UJC5z`GPm3sO~q@_`}Kh4~!UcDA+ z6k{CKG6b@mhYuq+Af@8P07QzGl{V@F%BWBtUT{b5I8;X1Mx3}m@(6H-ORFe#-Wmk) zDi5_}I;(Bt}Rv%l~e(3oqTIA)!X7}$jJWGgz-)iDTo6rb1yS@|GBtkBjFJIa; z3G*ntcdzyiqYu>F$`D~&rTsRK22^{CRt>j_KRorEWiE3B1;qSG&D>s(i-Y`6=zM@a z_TJi*dZ#6f%a@bMNIwj_Wnnp)=<$n2^t=}K%L1WCUj61iswiFQkr>4BG?MjE(Az~_ zqjmvq7yuLwo(g-46!?I=U{1!!Z%RzrRPUI~&*wVA?ilL<%G(79?9C`{8<-}pR@ePG zLFj7JCOBXvo<&s_Pt`TrKq9@UrfJTKdn;H{f1Yi5*&|})Q-DV~ zzM1o|3~`Go%bDjw#^{e~TUTR3&+h@0J0S^4*FNnedj9wU8BPtf&b_xp`GDsrtC)O5 z*zM@u{Ql*Ux488AK{wnE9bmf5dt(Rhk7jXw9Mc^*Lb?{q)|iQ|wf1U=xddrnLi9 ztMeFVX4VUxf**kg*XZ0>l&@oT@VS;-XnRlZX-FgnsJ4aX-y=(S6{$Sa%aR+)dsuNig;6L5tgb2BEP_Z zYB=)kVNZYl^M^g{i$&S9X&ep~H0E`urQZo@IW-cx{sMUHZxKi3Q^+WvfMcLSmmkp!i~8q`!^(M;1

$$o_U(0>J&3c!ZRVb=?kkbNwcd5Mb8AQslJO&@x_C(26|!X~elDRhJCz0czxi$VSWC__+um=MP>p+H0@_hqn2`6wfNqQYIw0)&K=yF#JWv7Q8pqz zfYF~}JJ_2zs?J^sz4k^G@pXmTFfQeVTj#lC;+_%Dwxh1+OfXT=S@hSl?&G)V#}1tL z?)S+&c(WmxRbsX|D#F_~rEa7&%F*|s!G6$g9}K!e(O$&ryjoQv0bOeK=G=6d=d+X4 z_Lj1Oo`8{@?lcq1zx3$p^(UK*o}bINPe!?U)&AolHE3F#?UV15t;S15Xr5nV$34a4 zqPZieT{2=~Z*hzBu~EN|_#WB0BuJ+j3>dY~Md?cT*cD`m<9D?$-mP6I%I>igFNiL_ zoin*#$AFg5I-1$|;HoX`k=dLVsf=;W#@HVhIW%be)ml!=`x#y7d>(2~?ljzb)CU{z zK(8jv`@Zw8RFEblH*;4nKhot^!#%ys>>jGVQrs*%ZxMO_rIabl21{wmmHoUY_;Ign zH)VUN?k@mXMJdX|LAu|ZW1_;_Tc@^=^;0YC8*A9>vyb$q<@D{{# zygn)M)8%jJ2(!r7p~kJi0wDc zhiq4Be~M&&za=+Glkrj`FI%!mVf}ZY*8B6!BEPAJCxCFefLb~?TW7driF5~iVt14p zzPqV!wf35Yn&hbC^8Hf&9()va{MNCfbaSl}ybCW?SaVTamakPJOQez)d|NXgI$^OJ zho29d6a_;qv2$J^Kc?EgL6NxLwfWtZb6hXX6)Z0efEy)uxcCBB7b_hnR~P%1Vb5to z()1~=5y8f$axgPr+yQU>Ys*(73&SPECKdG4R{aZ|Le(>_&Q>0M*Yhmq8g*q;&^NMw zjpP)B|1i<3xAs(I9^`XDGT*D>hizseC+^o4hIhY7^gK2idiR9!Akj_>5T zlxRX}(;MwiDe>}E=ExQY)cEu`mnYjrW}Ym%bFJPrzyk7LERV*wKjKj|)$XW!sIBBJ zvQuo0r+kmml#2J1oQ0r~kc2uC>mF~vo9qGs7wnp(f-c~VZ?Dgo*S?hY)QB-WuaAFX z#r|%Op(ku;O1)BqyopR#rc@X}0^+{=Qu3&W*d;SJg1oP6y6e!TO|DOiik!^;hP zVEF~VlJ1CD@h_*5_Qm4tUYwpJnOd$l^<~}=$g?1PYuaB%o}d$l-l~r1k?m-n7(nTS zm!_y5Gz$_5=H7_0RIe>SJ)xE-s|by!E-1eBVQ1}8fgC@O-ivl(C7Uk=P2~jp$hlqe z&8WIxOujcrY|~!Jv_|YXpqmSwHk*&97>0lCH;#zD&tEy$-CHSB96b)S6%}u))(A%t zcPMq4Q1TKZdkyvJ^GCFftrtXTHoFZIvZbjno*B4a)rpRKb(Jd^5c5t~J^IFjhn0`N zIg(6BkVNb%$vA!qn)j1>@neWCqM@H}GBWPAEIprLnjM)_uMLJ?sHmu;MXh7^er;G% zrhTkfF?e$67KB$RIj1t6}0l8SfOUJa(Dnk106am~H`t9+VTXy#>0L_r)VA zTt78f(+r3iJ-J450RWl}VUAE>!T*$Vqn1na71QZVCx2oKj!*6Fe(EfApAIV|e?gY3 zOOgA~YiI(au2q%ZSMTSzHxwnhH;o-ED(EA8;jC7>&h=>+_NssptZn9Q3J%7V7$a(c zS@FEYKNu*}h8cWn?oM|`vZ9wk&c zyB1R*n@Tyy;kYHrjqC6X*`ndW;T<`!Fs1Ah?u#+8PW^cCaxEA0ow0RiD{07}E|D@g zhu&301dr9IbOWPegH*<6ZSfS31yXF0SEL8~yz_g}dh3}cOPX8*MuI&&Ny>+z(;VXJ zY?B$!m!m)M5k@7JNVbIT4}U5QiX-|m^wFFxCuoBYDB34g^+Jva-SJIWH{?y$tq%^8 z&?20Adymgg^;d$8bpXaS3V+WZT4?LefXDPGe^z~u+ePkM={ov45=(LgD?l_sT1tM) zt6w8Rs+Vw=5qd2#m)b<@8=GS7g1UHt4sPy1u}aNw%vfRMmoKMrbD!LB?ik2F0O)>f zH^3PC9fVoq$-AeLgPwzdhWK`N@4Z<8+=3Tk-iFO@s7M zD{^1{_Zp!8GEC_cQWQLykdUit@ZgB?s5s(#sv@CL;V!bdQ$0%4f~?`wr}1yd z{FbQ_`GGjGdY{TvzLs~*p|EDf;-yVgZ@wjY=$>fVuDiLRwk)MAM~ZMJcf{l!XBDkJ zTEB5UUGt3InjH}1r2{il@JDS9_y2&w7%6_iQ;?Tys+w*YJCREtF|Q@f5%u}j=ilH8 zT3eevg}xe!;9AW0aCD5=C|3$mX58)O7(X>!Vq$y(Uii&TW3y#5A1BMj4_o|-a;1|X zb}L%F-rQ4ZRcDS@j19h$CE~e~*2K@!n_-{H_Ytuq5l<@dmazyma_O+z4F_I6#MSn+@ERU?zQBjRGA5sFZ8|I6Dq2G0sGuG_Fxs@G zaek45{7YjxJt}e7bT^q=dSd3{*(=DabhCH^2J)JCPt;9z>(tk&5hocZDN-BC4oUNt zPaT9g=Urg@(*`DAL~pF+L^5v%XA0pyzHVVRoLiLAmFB2vuBMkM_*?s5I;Lo{bxcEyifDBW>b$Q`K(we70tT$d8V`n=|E~wwxZTZ;C;phditB{Ns$L1`Luh~ z24)4=btT?iMI~yFPTm!nlatkuxY8%|YxZVlOH*-X%2-5?gj=23(0ERm`t@USY2Pv@ zLF6eI;7NMSiez;ZxU5(wzx2OxDA9B88)5kN;c7-aT6VtW-u>T2DRMD<;)~c}gvmJ{JUqXvK%HorPrFq)jO%h(Q(I@V6rb;2JVKILrw6n9 z=Ty2&RMCZYk5V=*0SlR>fJvmBjnEyAsVa_qRsA#VO7pYNA1fDjVefj&z`Xo(tNO98 z@Y=l>L`YuXSqZ<=hxbPrIK=x0EEZTcB6oPi>4k}Ig1o1AdJwT6bq`hR7^QG4d+1ip z_Hwl)67InoX|aMQZBeS37L5{&uitdd42ni)s^xs2lhtb(u`jmArqR=nqzM1~q0RyZ zL>BE~hcGU^UkK|`n8od8OY`051rQ|{Ji^xM_u5IruX6p&xCihjSVhe zP=7LGRFzR=uJ!A(4`SknPi)DlcS%^gls~~%CuAp1883x@TyY=&gct>jGlSI@1wxa# ztKO^|bM*wf@>~lfxf2(#FuawSuye5NQ^P!B+yPNXw#wJ92|D%k6Fhh7X}>$@T^PCe zhF#v6B8f{=m1oxQZgFv&Re5t`Xb#MUW3UaP0&)a1&IwFK9e20Xp+mKz^m);Mu*p4EaPyWV3V zyX4!qGD5dLwK?y_{Lk+lr!*i#{*nn5T4JZiPKV}Zh!U~;!bYHOm`9`cQoimPmT?PS znle}2>PdAzLrr@Xm4;+PN>1^}J9bDSPTYH%>3h4zgbo3vh_SPZ^!xmJwy+XL@(Vf( z``%Q`&IM019490p=<%?@v#A|yaqJ19Ma#_jx3u=1+*!nFrZ!z)3j1w}f(7ORDN#Lc zUJ41ID!yphpA}^4+#KxAJ+u|5d}&;rL`Ye$t2HCd8(9e*N&*Td&buotJJhM 
z`?)^W#1%&vcPfh)&|9-cJ{(Q_=RvZ=vrdWAIW~`8U-kI|F6n$ReL=KILHx&#D~Vkp z7Mhn25eT|Gu2vN?w=q9|-ch&G&bw5hT*v6Syn9}CRkHVmKmlv%&rh_>QzJ(ahRoBy zxTxzMn+XoolT(MomSIvzE|>vGxMEK!o|ftN<-1mZkMh1rt@x3Yf=V&x-m}ro$5y6W!~HSeT})2TzNTBLJnLD<{D*IX zTs#@yll^DUT+~2>OZ-6qCWjlOyqqg0xT{=iIbtUvtB-?r)g{8QIKzt4MF+?sfcF@7 z(yxzLupU9tzb63iJAO_m47a-NyPdT*!FEdWFNh0v@c#0&@L`R}_Ecs&NwD&@e<~s4 zGjsiah<VtB4+Xzon7|JR!XsG0c|D0@_+tk- zi@{B_JyECl4j}><`0L=3#Kky2o2hVZ^PC?IGOC_Kh=6pX`~G9Oo{*3Gan-()@DI)#X)49eFKbkKp8N2{I>oCAyZsDGuv+-! z+>jHu+Zk?ss)smC7??%-Sx~QGqgfX6a1%;xmA7UFFyI?)>(*mnvzQsLJ&KmCb2ynh2DC)Mkv!p>{-FuU||LG&xcXK?ogLjS>$Ss>UH zDg&#^rS`GP(j%X(c07fMpXC$k5?5|Kstb{2Fltmn5irxB(YTNBf~P462p^#ao2N-B zIb`utF)XjbjSeNI_3dzI1WF$?vSC7=Vf5*QD6*fq#*hUBv#62j<5q+OyYXNGWY(8^ zz9ek4ZnU;1#3Yub?Jay5`|)|nqvN1GX2iqWqe7f>mT}VhiU!GN3j(XU%dA{|WXtJ# zZx)s)R}{;!HshfD-vRw|*#igqBI$CKeKpEo+-V}?CzhDKxd>hD7x6kM%L$Xf&KAGV9mpWS(^6*3{hxw@zAsc)G+@4+-*{DFslB!jZ%C7bfaqgDDN zgH=i|#mbS_d}3HD3DEeMjFmg5RsiNTixt{bDTj6xbb^OvoYCC2N)2InYTwy!4N z-aqV-K6qIM=g`SI?wky@)onOF1K<$b8Kt)^$6FP}Fqt|~vO$EBN0;K9nKIkGx}s|P z^6Ol^yI|{S*;-3?jH-bwTG5pM`lU5J9n#hN43px1rQ5WU?u8oFo^EDhsKjNuMk$O8 zgEG4Cpi1R1%4)+r>|~0cZ?x47RdIABV=Z1Sjk>Er2*t{ylGa#YxB3VX4M!xVnY*%E zi?}SZn2cv@?jpc9?*OS|B|=thJu2zAk7UtvE9f8|G_zeFgCyGPuD|GA0@% zTHP@FuUkvD_g%x+{X+CS<_4Eq7RL5+Jmk!NaliJkTQPaV?Exf_g)(~g{9M}D%oOmp zCLvwnCXhmAHp@+4PY$h9KfgRaI5haZirV)VM7#^H%?D3-0i?MS=PQT-=*I z!YD1|<^=p?GJM%>*I&&)&X=iuh_bicVDa|)O}(Xo>e`0(_wM%b>m`Pw#*6FkGOuK{ z_ADCWqJ!UBLL~!{Df@Og;D&gdCNHrgt<}Wbh!bjSh4rsld%6U3l7>EX>Cg4}NK`H~o<;Bi$NuG*V`@Rv|s#btM@l%2; zpY=>8-4;1+99u^H$}wgw!9Rb-v$Hc!x=mwRVGx|iJ00pm*`7s-ph8&d(w5P${pKrz z^Kz+iV{mz-7Y1^LK><(GQJaY$W_Y|E>ASnT-!L^bRglf0ujbQSW1J9GDs8lM2~Om(X9w{e~ymj)rj+tedn9Qr;*EiyegyQZE*0WP1myI z{Y|ul@_4-(MnEPxH@9Qfp)3#o`Xh~4C7G{ygt+g@ywi&N$r!w~zQ*;xU|fMs<23^8 zLxSd*ouo?8g(9STR5*dlvlD2nTy1`x<@aOX=<`|@5LbgP)WNBIr0#z~bDPPxck)K4 ztCH{bX20OsZ!-F;(@iGz0R&U$BE;YEJPi%H`G+o8RM+c?&*H*YKl3g2REVn@ zmQ@7hgoO32pek(oOW#kpAKG&qG$~LRhI^BC%^s-6+~2(QwbY>N>ce@=gp4E^!=KrU z;#DeC4EH2mAtNT>4EB{q3!GLx>{NQniedP%YE0=NyKmC9@MGZ;N_W>&@hMa+wCLtw zr|pb;Y8EglD%IpqF3BXfA)m6o8a4c_qN1YhXJ2Mp2q|;Dl!Yt4fcuHdi4Sy8Q*7<*@)?*mqmom4Bk?IOLBO#)e15tt{Uq=g^ z5>o!@uJobnlX_g$^c8evRKUe>s0hvfvnAL>jZe=TPLQs6Zk!T=*HQzFi_v&|J=ZWY zwb%asS}EodkKXRkb+d&a%R=S1&pd+gDxG6VUV0Sh8(qhc`W5CU6+CC9-HEGzYSxzo zW?zG2M-QUz0wPYlw57ktb`S!oan@dRTs*E;<{8~VaSx^ zm*w>JJ&mGXULCmX@-`CWyRdY=A=Ki*-!3TOe?9DX z9yxM%3-FM9g_7izn{?Kd^61J>=Wd@s-^F&i;Hu&ZK6ocojd3NN&;JnQOCf_@=`AIa zV}>hTb&2tmq8~Ft$T+wmRd_3FIL@{!K7jEflT&KKu7u%G`5GOhz#gjmaaCri1)GP$ zpmZ;KW(aDaguLHoGfc{x=l#+8-9onG^rb9zIUHQRb2+`IaF;Q5xGU|f)`A$^-0W@h zN)?nE&gI%na)z{`NEqXEQjF^@>MTTor4aRJCbhWd?i5*qZfRg&B>5`F^}5Uq&Kzs= ztF!Ril3g>&qCqb3_4Ul1ZXKGMu3!K8b_NY(1GC;@Td4XrknJ}5-Y zW^G`dpL%ZG68m6Ac=!}b_Z9vEGEy&CWu#tlLwboNvNipmc|SHbcGVli*pdZs)K(xX zYlhKkQZ49^wAR&qHr20=bI|IvOa9hgN5TQgWt-vr+SbLPf;PyXF@y3g9(!VyCLCXS zU?4|_=9CK|6|cZkiM0?n2>OwggB%g#0`RErnmeJ?;Q4SOIgwc^f@Pq0@Qw6)Lp_6w z%Xh->Iu#=Wn&pSyN2BYQw{PDv@6W1lVV=qS+}MEwE{))eN<%;0jJen}3fg&!r)hJ( zPjr}2@>C}L^5)w`hv2Y^0S8Hgle2RZb(Ar#ZSylm6HxF4E!bO`{dwJ(koo-BS0TMS zF_ErkXy{x&FFC%ap1`LQQ109cJl=p~;n0=b$NXA%BSV(OmkRF~d>aEbqAi@6yL_&U z*uCqGJRTP!_i}`0n6kL16oQGqW|F;y$(3Q|6opr5&=3?eK0Y35B?3j`Bt8} zRN$;d@9`P}R+ZW5=va_w5RragSh!TC&AXW2QL@1ZS$AJRc-Ey2THchYw^ze2BJlRq z*8Gp2v^YRSq$Gq~!dB4}f3pA<@2`W%ZDAqVzX{`dI!}UQ7cBGCZn{%&sH8rN3v#g8 zC7(mjbvX7hsGo5Xwqfl1*T~ir;u}>nP*i>;j^X_UI_yfJw|E|iy;eez4Za^OQC4-) zJ!~%dvj3C)E76)?2G{gFR|7T598%pj`_#i*lg~koI0P-l(iE%S8lX)cFcq)wN6yELP7lOFC zQG;2jPl)|p8goal?m;ftYp{fi3Cw^ah^zIye63t4-Q_0=sVfGPdSp)C=-818FoI|%MM=#!|yk6&+v&OV0|5Z)P 
zXQ^o5lry(hahBaUm2(OkWwbb$50%$t_igss{i3$R!+?R-xe9xPcoVu^?005f1I0&N1yOeLq*;TkXO-+Ys zAxBGo&w8(J>13c7;yRR#nd~2`Km7g$dQ=ULb&{hMk}kOnEp~6`CR=SBrb$HIYX7_r znh{uYIarffYw}zy3kHU>e?Ug#%aS|_6fss`#u`VLIL%@zjup4DoQLyOi*;V=SH|G; zx3RT|4fjpMHCpGpQ?D^r4m5I3&qwU$w$!<*TjpV2>2f$gyFo?!a>#+{{IGoge0ze{ zT{hQ<1j+bZ`P#5zdpIC~qFs6B)M_S|_sjz@vFfnt|$MQs7I zLKem_^1x4q60@S(WmJ_FXxp#*fKk^6)`}Yp-4v)v@Dg39 zr{Wj24gPfe5RBqd$_jC8omF{#vCa;d(po_)W+L1M7#%Cj@SYmTaR)}@cu?R@yPIuV$*tEYYw*y*qFaJbu) zJHuoQ0^nuc&~(Apd@>82C@uZEro_$tv?+`#PNtbi*k5{Q$>z8tf)STT-VFJx1<$$t z`pp9i(YTnxmpDgt=ImZR!|l^VY$^0!8iAD?b-Z2YTv7|x%PT4f3ydCd*|#lF>AI$5 zOYwDN7t90q8y0ww$_3nt%OU8?nx(<#FCPr&Af(pADPlIGWM244TFp%8I}`d<%lhc3 z`@y?(HT~`1*)OfU_M-|v*^g`(g)jbWxY--VmxjBWqtCgv{CwMc8rK%HLVw;?yGu>( zqlo`?-{|&2tt$5-0$nGAPfzLfRz8PPaJ4o+XC@qhAmh7TtMz`kYLDh;VkV0R^Ljf| zu^pj^N7pEI8R@j1R^Pfc`-4_OsLef)ToP{nqhy}f$=qPd$`)feA zi@v+Y3WZFBkN1+**yn#Hk}!ljr8>8WG(gWJaIJTq33`D~W z*ib^VHLvBDb*QEe~BIJ;v4isp9TQzav&?uTq_X4_2+%3x=Ow(b5RpXmf*=a%$%)cTRo=D8rf&wf4};%nGC zP`y>O(pfa1Yl#@S?R46GR_acM_iB`b9e`%aaihGvycMuvvn%?AU%^ylJG-eh2Lnls>#j6GQPZcQ`1EXy-rw(!f{2q&bVmip&n0juDv-!Gf(DD0T&dWR%Euu?&ej8-XAAH$MS`X*+4vn81CP7lmC6~p)H(p)DF;i?E zF`3T)c#$uHRGuo-wJ4F#HhXDtk(<*SIUcm?K+RzPa|lxJLfZ|Qb>J3rF^*)|TZyQ{}ZXJjF77W13z zZ`p^_GV1AxxubZr-q*FhA2~bf5=+upULU99Z`$408&r8&Xm_i15~Z2mkn>gR0g^}N z$={Hk)h@%*ngr%bxNOh5msoY=tx&t(|#Qy4+yjN%6E@kIX z&-k5lt<#G*wlYw&CND{gXe9Uzq$f@I9C;Nu%qbRF^l@HtUC}Q9Ggu4*sW8;2Z-*X) z@!)42La{FBuou7YdAKt_Rj@d#T(rh?D&uXGyvt+mXjkWxp6Ask&eCr9n&>kDC!gGBI?UIT295VxZOTh9Rhn;Dv&i_>Q1$3U3j z@!yUC*ZP0{HklgJeBua@8nan>dhiN;n1V2bT6$=MYQV{0N6Fh!yZ%}DK5|45w5xxB z-tX&vR95w_7(MWfAkMVDJ2>qF)q~!a8f=gEKDk{W>KE8JA(ce4(mC}D4mWI>{LFja$3VPj+^x4)V>Aj(NNbdh zlftg@`?Nw=#D~XS2#p?#w3%BZpO;llJx$VdjfbOaF6BW+A}B zdiUG^B9QMTwgOUjV3x&pygEiLtf_GL7}YK)66&F!ajMKd`wgJ%vQB`XAohNxYkmyB zf$aSkTD6uqS4oj|;v7=RH>Z0HcUz+?i}llPVG0wGi5m0!4`3z4#N?k#hziL~tjp1a z=PXiv^JE#m&GR>Q3vf1b*5tWm=dVfoZ7&qQ-O-Z3;Grvx9{6c8PSV1B94~vvgU{$& zh$kK{^cZJ#gyq@Y9yNGR5KI)tCI|u73WqXz3I*2WYkMe-2MPVekje9Nku&Yy|Kshe z!>aDKZUIrcTSBB$l$4YgDM19KK~lO^q$QP*RslglKu{^^P(VTir5i*fB@~cS1PSk4 z8+^|>-#PcY-+i9@-2dLkbJ+X$TYIg!<{V?pF^>mgT{=t}R>Lj7UuN3dQO@u~znx7^lfqgy)h2ZC-{eM5ou27*Czt_Q0y8&x_LCPOxE6zy*9`|wYy&g%OY$P)@}uiaUO%ThZB);*8ZS6avT^r zo$&#(4-HpWA$iQLl71T<3c;p6H*E|G%5*Dm$nC);Y;*zM#D(Z+38{@sC@FJpmzMzI zztbPor-(VWS6!VZU!mS6?}G~=6@5gCsw3RzrV)oB?vLXmV6X;mM;un1F&Hjr2~De0 zg4)EGL|7SzU;l+DGcz-!kM1XKqE5d!3$%#QFBIzg=Ea=gYiig52<5~XLm430K2I~N z69>yP1%3PWP3wEObh8}(CSpp2_b#M}5@2_EKC{@byG~PdCz46t=)Bj6@0E5MM+9Et z6X5Vg96_UCcF>Wlo7WD8F)tY925p1u0nljlHwA=1c}%$tA)A@2(pLe|mC@+jzqg#( z{;+(5GspgVEd-A^?%n($ERarR9~ojIu&XvV#9jv1G;t{X^RBq^k_HV@k~g@(Gb=@d zZ$Th~x02%UN`n059a2W|CZI`+wkjZY!N zNF$UES>g=c=ZEiG0Ownavd6|TMd;}YPD%&`x-BJ7CuKUblXLrrpBh#ED4x_^bSOw| zRoZ*a3b3J_BCfrlDXjaMDA))q!z0F@`dOA;4acP6!V_{kSvW}Vg@-nx?gZh=Nb@)O zbE(IU9M?K8pSI3c+1@O>oK3sh*<7$I@FGB2V zM0N%oYGW(ix@q=q{O*sq3NP}b_+Iauam%5n7Fds(tGDSwb*NSpzlY1Yp2wv!1PRQ; zI~4&iDkA+r^IRKHpPb8;^ttEsdm7?It3N==1pz+AyTQwZhQYy!D z-zemd&`R+|u)++n54uxYzYeqcJ4TjYa`C9X#57o4cXvExiF^ET%_UNNFY&2;GSCR; z^Vx!#C0|Dbuy*}?-YADJ8iAOeb0*JH=y4KoWq2L#Euj9h02t;+XLG0jVUs@Pie)?B zVu^TFnPCbC9baAvTgA&Vi#ENSlBofN3XFkKal=6HYvjJoVEGHf(OihAdti3B$ zS|R{a-8p2q(Jd7H{qbgjz%0qPMm*F=K&Ma?M|8h1XS~d+nZxp1 z?QoOp%Q$_kg`!JjEk1r+bBZmoMuX*V3!@OHdOcn9A_g$BI|8hahRCl^xl(1T12%=5 z@ks`_0SqN5-FbI3$U&%>$NIQXIX>m*-cMq#L%uHAcn$iFRcRG|E-JrLD&z+G9{&|` z>%sE1$l^hu;A+O66DxZm~7+C81^M2@8 zid9OnxYpxZw;`TfcyyWqyO2f-BytaIib8=m`+kO*jpC0a}&!ExGO zK=PF(>{CU?Z*Dg`aOo%0w{z)JaxcUAUc;@zm*HLl1wP8(Kw=*+U(P~ zM(Xl5lL=!fodsnDKd=i1G(k2Hk_4iJf!O;Zs0gtN4#dbZj)vf4Yd#q&0)ZEKw^wWc 
[... base85-encoded binary image data omitted ...]

literal 0
HcmV?d00001

diff --git a/docs/source/assets/kernel/q_vecs.png b/docs/source/assets/kernel/q_vecs.png
new file mode 100644
index 0000000000000000000000000000000000000000..f55b3742f3c6a862883c6a966e0c2ba0dfad67f6
GIT binary patch
literal 42065
[... base85-encoded binary image data omitted ...]
z`Xi*Jp)Q0-mJ5QV5K+H^_fw)sCLIw^;~AC9OJWpTcN`p4Um>X%Jbwy13TlqW`0e&o zlt8J+`Yvx?hGfe$eYJT%2LQZbG?gX8QB}AOuuI@sl7Fps<^q0>56;iY%Yb#;YA8)b zI$F}`_oYbG%!am&jMl(a7{2b_fNpI+prw1QxU}GPz`avit@Iu@MB2t6kBKM%AZFiEC6aSlBZm@xmUs@Is z{&Fguzb3@ty_5n)4F8qfh7Y|9m`=KT+=qmbKpk>U&<|ZCzzi5`WbeUQHfPY3fQ57v zCGJvd37X2IDtn*?&DjMsf2NIqtOKgHtUxJRKD=^_aQ;oqBQWng|A(Y-9aRG7YshiN zhYz3F&Tv%&ZfsrsLH_^X2oM59@ZB%>>iUuHvEYusEE$Kr%TVRPy4fj%syF)NSm(Vd-ihCZ?h0X1%!7|nU_b7ZfX6iP~_wh zoS|J)A(i^CuLE5|2)Mz6(-3`t=e)1+4s7YUj~HMKl<+YWRwxXs@pA$9`>47NTkySv z!5jDv`)>n@Y9UY-GSbws%HPBf3C{h72N#4dTo9dcaZc3KAsry?YRcApzCtdai;jN$ zr#u9ZL0*P<++ta1K)#^XME39Lali{)-b6k}dAPL0o;E|Ar;d9E)bg-^X(jBtPr8tVg4$7&TNuw9gGEf~ov3hc*~&y?If1D_KOXON9f zfm7;zsocHZfI|3Doq1iV4Z_x3(Ym=+87d#s<0MP~rIa~}?I-;%XBVH(pm%CIeN4}$ zzRvX8{~}pOy7Jv*rpD?AoISe9@Y?5xv*RiEwe+6C2s)*R^=lp)PP_wj8|kz}v9jy+ zI=kP~aQ5bUczmBj&jO+&A9xI3+Aec3R&crBI0(!;?Fa^Ibas|NqMnmn^^a5=X;?O2 ze!5Nh7Z-W>HXq^y+gw;uq1A_aY3D#w*VYVPpjkA*b(xPtqlyAwyKQ(H0F)I5w}2uK zo6}LvSx_Z-kCq@%j5^Za)%Y2VIo;KdchTEf$7$uE3KQ5w*$ufC-(Jdy?SbgF!sl1G zVFa{r?o8v557ZN+`UDuv+BNQ}jTV0ZtmhYtLl_Da;8pHMWHUHP{5(O%cIf*0barr1 zrx74Go1OATyHW2Rn{?z_@Mi^Eg`ZKn=x++k{OrtHJJHk!q-CylDbk_fb}hUBz}(`x zwp5|&000qgpC~}}N6I#XY6fT-jMDpO>&#T8N$zJxS85N|;Ksery5FbR(p4KlFEhnK z&)_>-fF8h;M^8tHL&aqM=IyOx*Pi`0${G(_FtIKlOEO_2@=8kLPvdAiB1__<)1h1P zbbun5ZR6VG_pX*>?T&~6G{7T;Pd&cqHATZn26Q|p>Dx?;Ez!&A!9KtOXIjLzR^41L z-p=cRqT5!lY}qocvJ`yw=JOnFvjh!zfy90J3EaXIoSxOe2^E9Ok;M-_r_)$lx>B2U z%Ycj6v6OBDOsRGA<@~{CR_9dAYKuW0IHnA$TXbK=a{i7dlRT;<$m$MF?<%WXn9&1) z{vI7Y7JM~7gVMH|$gnpkbhsmnaHzI9AB=RseRs<09RQ2(;a?n#xEa-w0;{8Qtvx>R zCPK4zJgV<$gN(n}pAyjYE7R)Slm7C?{ZzvM1G&}CXxyICHUosYQ9VBiG6BQ&BSiEno9ez$(w!Ah zPtv!>hjIV>bk;r%WH_PA6M!Hzzu8;7DYMy_Akuwp1iEbqJ`9EHZcGBO?wU?(kC6hn zuIE8!yf05Ko(9j+k}$h424o_NNWo^-*cpziD@>4VYhrb@+r zr}97b?bR((VjRvc-b}m(_3XCyT;YKU)$MtwDh%ZKyLM|vm25X`6NlLLOhs-l1`8P~ z6|7Std1ktp^eLA)GcO`Mwm%jIO4)w>)iHeYs5On^*-U=ZZ+cxEJB}#DzglLlva5>?!s;g{<-SvO)je$~%gvITkMr42P=e-_zgc=bwtaO2(lBT{dY~dD;sc53^_Ftv+KVCC zE?_(+=lpK3JN7Sd`z+4BzneLZwD+cUGPnHlD1ZJ{5LV#U{YQ-53~8^86Ae7KEG1oS zr+mPq3I^7OI_4ql(XTw>K*g`ybA2@h+i8b3tbXXt4ph3=vI*4k3^ObPkffxaC|@n_ z3k(m}nA`QqWEv<3sCsDND%$~yinw)hY!*B=xFI;W&LI0(-~?x4Xupu%rOk0c{#3Hz zqP@W5P2`nr_&HE#uA(7zJU--i(Z4f#-$kbR>bBiNW;f)dZrTgW5b~$QjhYtmQcTKU z%m}?+v9(Nqw=1t+=yrt1{xhn3`Nn@(6iU!AZCuh%eGS{zDO5<)APEvM^@#{|}bk(6`@OpyOdC6>XhsH5A z+i!0$K3*Kp=V5(DqnH4-ET|-P^N`sBV}0Ojusp^xLJ1go)~d1N#}uoeN9hM+E~13r z2iY9&m&&+K)8|V}TSKVA$Aq->9HkV~zJvs(SE|i=Y@`QyXqYoit=Ex`%*N(&+csnn z!Kt{^^k2gO2&fE^7#?yCUS|$y>N` z)C%s`(^}vDA;`xmcy2}$@;BeONDY7g(N?_ zl|dkqQBMEm%MRU{UBxJj^}v@I7NkXr)|qbd1%0ZCAdo$4kLhX#XuwF-Q%H@?b~svs z@a8MtYV016JwHH^@&dYLZS8ef z6x?!7rMN!=jk&2KOFEq$59i5uwQa2WoK|jVTO6i865cHBH(Na}DG+%xV3f2L17x=6 zmH+$~1Y%)o(mk-fW)D~kRZj51$oDSulcSmG&%;Z2`IVQh;|XxRomApme#2Ce?y95f zHt)K#f4Sb(e6g(s)wzV{>Ab*q6J(!eQlPa7nmUiTp@xZqTAret>t38BBd__JN++`j zYdaJ6sOTF}cYya0{n!Kw(|i+z`H88<89m}MZEHO$noh%DTYa2^5!(leO^fK$2iAo% znWbJkWp(9mKQM<-KwbtdFx>)&PIUh|ZSkt008vFmJza2KjYhDI`DJi#E+o7gqtFXc|<^kI(tjmXvC=1?$KJ?>ka>ble#bzTu zJfEwtIuru+obe21u8LXHI8aW5J$%gCdAG%;<+jW#n%A!B=Jh-_Z{o8xrf;tE97jvN z-IS{?%xb*$58a&l4zp776@k4LcCWQIZ2WrVVg<`m{X)uGGTgu4*_?Ko3%+)FUIycK zp}J0QtDc0B87u2mqJIF|+Pa9f_`sH-l9I)=1-2M2Fc~C{Co!`dA5$%jeNPyZaW?04 zdcA}+HU7t|p`c*$`+HYgW`0#7`nQKk(APzSc0~&@nm8&I%`j`pnVNTsyz@2XnEeIl z{MPUN035GSAd6GKetm`=O1X5LRi5&ISvj|37xkzXAR>N6!W9MimT9vo`YHOd7ktO* z!_4KA5DZvad0n}^&WX*Y2*lMF;#z%1`tZ~&>|WD5SI@~^%d?9xRMouvSk`>vLaVdU z-Nu^ZFnFz#Nlnipi|veX)199|2K9yc8do^JKktZoc-tj`-1Q*4I?oRH;DcbJ|8foF z!tc#oH)HPWMG?J?vE3Va6D8thc(XNHI5`18Q~}GFZ&Q$?IK-+5o4Bx5L~eF!epEFI 
z0c+@RRVTeCIdeguUVOx-9nGUEz}aTFIynrVaO4iCc~8)zB6hveJjSEJ(NcCl2uGy8m2J4oahlw7 zx{P@9WFxCru9sGl8*(9VpEa(A%8HkYc>rqAMw9ll z8+@kY*Ug$VQ@R^W(|S~C3&pyWH1y}Iv`DIg;$gJSsp1q)PF=rN0?m!-)+cJ`CE0X} zHGkl@4hUtd)DJY8MjumhBM21m>tCN_ImyeGc)^H3-dFet#G z#a^jST2+n3_jh=zNwZW$v_89ZS(Lts#{6#GJ+kxC#L~r zY~|_q8X3bXdx?|%ZZ`bvQs+d3Qh}cxoLbLMEl_B*lBp$W-1Q&s*m|>#52EnhS@02A z$&cS(NH+ZWh%C0{Lo&V2+xy={8_vkfPbg3tt1S-)Rnbq1IxMISPq|Ih#rQj2F|p(%4_dOJ;D z#WgfHm9&pnqOL4FK4n(2@-=Qyz++{{PRUe|t#P-iLqIRbc(>QrgaWan`30|1NZBgA zb-XK0yM~+yaf8vpS!0hn6hPx<$N8$Q@dHSQ)Qj{P+oX=L%WrI2=5SVfK0Zn=5s>ST z8z8r1TIHhSY{F#c;24;dbayjRx|NzVXS}-B{_VZ5b6w|mzMSiP_gs#*Nx=Mk&sU4=5%A7+$#T&wHCjYQJpZ`^sB$l3`!2taz%G{4NWs zYWc{L-EFANFREm}x1|1lwfs<&y)mo=w}fwHyLZ;@XdiQ2L+bCgFrb*gXSimcNA|V2 z$W3CzxNw>U0Z`^IK+A;A`;;+va-s#xaE$5BmF4mIt<2mS!4k`Z;p!2~uftRtHZ24z zos&Nf-%qaA81+}|k0>!xLFI(rISNZQ>(%vzRX(Sx&n8K#HFB4 zx%9Ct>-AM~M#YPGDo5Wr|1!W|i1AE4U&%mvoc;n2U~52*oyqJuFai7%7Z2nMlJiEc zF9ZK&eA8guF_wm^aF;7lmrDsWz>4Mojr6Z#K9_RHc@!T<>c8Zm7Bvd9M@6`X66;+S8mXaWk`PQAbh(R z$=yo8J~wO7^C)4aD7bNsC42eCFX&HFyrR&FAfQwl#2wLMHcWvv2Q$MgVu?tz9oOdL zeJ+zLcQEAka{w|MtG||FUIDpT-2s=gKgDM4{K#CO^JG}>7w!ElLblNE{GtHhgb(RnJ*De;6Zme=+GES*MNGD{uN#o>pm_p_KAPDH=g4 z`{nwE*O#1L0|M?FgyYcsbIU1QgZ4KNAymToxlgsYz8Yui`K3<+Jj?3>XHFgyd~W$k zM_r(@eD+B8H`_LpYB5WRojwh2&4D< zb7NX^)(^J*{Uk(bsXv=Sq87BIu+*4JB2&JAUb9>2onUsL`WzFNe7{li!7XdMG*XiO z+-=m&IPbl>=A`8978C`ftUPS!UKc=jnWgqVoV8-Gs?5<-SW2=Ynigji;8hUC<3gPx zXFUPP;a}B#AKt}KtcoV?_-4?b9iJ~ZFpl}iU_)MZAWdt1m#eilcS~_;dLt;wp~-2I z-m?dvk_JudXfZe=*V?0UFdJ(*p_m|*Vf@zVs(3POadbKTTS=wu(B7?nj>;N)O%=c) z+0qr_Ewm<74Vj>ufbpjz?C5lf=uFRQOYweh>Zi}9wC*7X<2fX2uT>I9m-_ydIsCnn- zZ0BO$Vmd-JH8QcOu@-MreNBCR=sA`N$LdUYiMztr7>Y$~)}M+hdk&~LB-0%pG?$pr zX&={gZ2cVRczydg<)y9^O2bt~>c?-siFwjvZl1Q1iS7P6iMQBekMS)=g9XRBf(9!c)BZlhI{(L3o(87el^qj2dPVe5Kpj{5-OB=K1g`{ni@(2N?7P)X!YJgRn z?V+Boe20s)7-q&b0UOB4jP>=jjY175+mt``K#L1g0ud|n-iPCwEq_9+fx43xS!1aw{U_GsIs8q} zm71a_C24bI8I00+@|NRg4%^Rg+}nyn_ObzwLFapHmiM$^dX7RN?Utfi-(;>jtx0UE zN$V3!{`pI{l;umUllF=?J&!#{%;`Yw{buKq`M?`l%)&3|dfoD$tHK9_OSbF&>x)dm z2kz9oCdxPI-jlOl_WE?R=zLLvpELmA))L(JqE|}8B_)%`F#y)Zm}MPeG3bueUU#dC z$iI&K)l0R)Vpo>UCZUIn#YSs>rX==KqK!nMp5)=`kAuki7EGX|Bv<)?;Oy43am!Ls ztNSOC5B5Ulj@+-+NG6}#rZ$^yxv*p;F8IA?Pd=$vK~y+!u;yISP21VF+W`rK3QIz{ zMQd*eNmf8Q-)&|}qUF5RuXZ?H9^Y#HNu5D{E`-3;iO*y9rCqjq(~qvGn@=U$q~CSk zo_beCEG#O(H!^yK%JtCN9EG(Fy}=f=^nJ&n-RHdFW}N5#7aM6EVl1t|h3UJs9>$(W zv-si9_N0C{%Oe6$G z>3T3WC7-x6PA+tHokoc=@e@RbUZIkJb1u5@s!;P;Zz)Nq;8Io@k%wMDOZO3~S@bxi zNGEDP307hM;!dpp+^$(_;q~(e!z?j}=ZIHRs#_jc?w{cq{nDH>J$mupUaBULe~2o1*X>Sz4G@sZEK8TpcW zZPUBD1LPyIU&2S5A93FudrV+l&~(4+esofib0TSbT-S+mTA`#n@!uAbn1gs@doPkh z17UMb1IGfp<%7;3a9w!Mk&OOhQ2?2-rRfc+g?l!r*G<6cEG;v#=P&p{5(a_za2V%m zoRx%~ri;xEI@eehPxxo;3HybE=pty>gGRe*2WUy_8++}j)(Ks+5$D-A>}~k4*V5u1 z#|pbRNIG?j?3=|OxO*FslBAe6Oz1j1Sn1`;bPwjPU!xJA|5i^R5KBu*$J^A5S>xFK zas+t4t@=`+j};1K>#VySsvuIn4OGuybBg*@4YxF@zfsaTKVKu|ot1&*@M#+pbz18C zyAp8zW2z7^)>4$1-)U{1YrKm*!8Pd(&`N^+prdqRVtneOi*kt3~u^W6~ zsqhR-*28-hV2(I4M}IlJs7 zP))fcr~V;LNTow%GVRUgV7n+8eg_b^$DkL=)-w~vYZ!9=TIu~E^p}>fN#Wi@!yQ@c za#ttXEwhRsV-g>2L?ozqJn>CcxOr_0Ocl{Ep zJ4YPALeV1p9cSiSn)h!@5NJF-MShVLxAIJAzAlG7S7MC)?K0MzxI!Pthq}+(pZ$pb z^ZOc)(_;c| zSuz-*a@#=4OGg)p5VoNAjYdjm?hlbV&dpCwzIsI0DU6NFH|3+L+%3;1Q?=HXWiJkU zq`jTg-X?l{5WQHiBpNGv@SUuohWzH|UO{D~sBH%x)-nQ$|9MLYrTnrt^6#duH!IVE zhJ`YXGx(D&aJF-8O^;~tzJ5aM?tGzqM|ZpTYz-Zig+U`|^R{N8`Of-gkajC(C?(*X z_A}NO7}B;|+bMbZh5kO+3&zhgVia6|oa=V$uWY%V$um@z5gj|0=X3uZ-J^sUiIVVT zo_)L#v%EUOj0M5+r;+ho4!&OMdR_#A%>!K72IFRh?QPIsr;)0HH(XQj2&6-;IYXLh z!TI~r7lm5u;jj6mN*oIHo^!|k1~A^x4H;xj)sF%9LxRJ~HrJD0Fm)ip|=7&vjT_{+E5`xuvex 
zAAGXTwIQ9;*&!#_H2r+@1FI&h?_54+z`;2_UU|5x`H$b=%J`|G3-$4hzb;b;oDP}% zGTHFvEpbPbPwl%YifwcP zg7Y%?1O$(8De)=EFI}FDKPt{ht+2#kS1V01H5ieT6_%X+{RyA3I+!CzmFU0D;lXCR z^TMj(;<>#@PqX&~`0HokuH%-lSF?{k<)v{Zip%hU6xwIpqd3HIUA?N-o8WYTyGOAl zre^2meOz4p&n$_UxiK}hfP{0D*Ywt8_qJDP`lc?2PFvw$XVF_)OBDBctn0A`;O``L zvWM+vxzomMG#H++8HhTrjbcSQ#g~MJ%i~~*f%_!=3q4gGb9ZGH=o)H8>B`lQ zbjI>?mW36Y4(`nucFuH=0m%KXCV_{?e5~vi_p@)WhpHTx2G}>g`!O`o=LlYYj&)rv z$o9)WNm;%QC{-)1$HA|oUEtb(v_U0H6C311D#*%hzt}faNJ7JRWBVeOx+p6l2?*<( zVNx8q^ZDC_>(A2=F=+0_i2jP}&d=|BZwlICuc@pVJS0+9(Q696WHjHElypatSCZCM zCJ$$ezwVXf0J&JAIPRL!;O6o$cTp#}CYXX}43$RXNqq8jI^0!RleDZRV~gKHD*1T9 zU7?@VH%rfHhFr_D7#A!AHt&%~sQR7hmNP%SPs{>ld{gIUbCxc8Vi~?ERW6p#2pCYi z?dzQ%I}QQch%#)Ah(=J+hSPab=B49&YoQA}uTE3Oz+{guLnb=wd4lvNyJkt-{*K9N zI@;8ql9Nw1l7kmq0?LQs18qQsnQIiAsI;5)_m!VQ^(kAY@E5xNdSE#`@HPCA$5K*Q zxfFiSH7g*?VnDvZZf89~qnYZsHj!lSDu&}Va%QsTlS9kfxLFFu#P_4V?9G($-vh48 zengClqj>l{CY1+|LtScKq@=PgtxKe%?bVmHc<^|y{%m2(jmp<=j8PET{Wc%KA_o;A zpXor>DQ*A3hr=)Aa@x;G(>{e<=iEpGOr1;(4wku@^NP(-84hkhL%`6La`^HPO}^>B zaPfOCiHTjIn|ZRKR!5l|!}+D624~Syn^KfANKB5XjLbJC_B|Z0 z>!+=BAJ5=TZZe;-I=1MZQNAy8|C2X-;q}+;8L)sH1VpGl!~CmjA9sI*S8-jE-aW18 zu~GU=0oE`OakMk8!kvwAT?diF;O1g~W^GAiRT*xoVAMnM0>$j;zIifm*B_oTO}8skX#>8(!Kz;UnQE~!t&y>C zFsG~T@@CNAwn~hKe80o3rC40I&jBE98)k(HqP}n#hrP;JM%56n>A*z-ULI@R_?FA^ zTlda%$=Lr=J@_FNM88tn)m80!sTU*+Q^)iwg6tYatW0kzgbDYjDTHwYrX%FEXU*!o z2~xEy9qgS}V0RKRdg9>qQ$+C?hX?>MoEZ{;c&vOS6Ro;FH~Z=L-@-3q8Tdq~um>S_ zgMlfpxiFXRCn^e~62J#|vS&-LVS&wL73DdAKp#o@s5;@dh>v@*{}C z9>NA`X1z~@i%_8!H!Toca|A;*Q0uIqu=9LRMdkoY6Q_YwaeXkgL%I@ z$lU;(HP{M)K-t-F4bm6=GhlKt#%APPX$ek~z`0_CFCj^Z`e+Jg`s0_ZWZC=^mW%9D zfpkIx-Ql(w#^iZhgVhv-7(3o4Qoo0bw!qwEOW}R9L5ubE zp1bgG7B%3JwJAN;IX-wwt+%|S<>k0(&_vE{H?Jrf1$Hi34P<&`_-C2)mU3g6YhIL7 z-?)Oz6ijytviHMy=C1kSY?ab}{)Eh;!D;fYCcdX~7m#CcCP9hvhMuKXzo3t&RNXt; zC(JN}@!oS?R{yudTeK8VzJCrTxd{nf=3ip26T{437D?lHQae$0wu?0?_(UcvG7bVf z>Kww>zh_(V8U*p{%f^#-{ z7YNJ_?F+@kzV_=;Au9@VaGkl}qbAHt$tly%tZ<~>?!;B- z;kryc5KGpRAkF&dnT@ufAVQV>znKg)ewt3Tb8)nQZR+lBr-ub%19JA$6Po+KM^^ku zr&|m9)cj_l#^;Yg8r_Yc=;sBmsL0`G{_GzsQ`ZZ#(W~`m{%7_Q?eJ*$EqGuPb8pOb z@&j3(HVhULZNL_VU{Aq;o@UqwY;v!W&Z-u7mI1mM*F{7Yglu#}md{@AciTXOkQ5wR zsK7hxO$NGF=WL}9%`ac9MAnx8rk<5N@N3Hzc=|gzGY{;?E9~8Iroixz2XX@*ZlkuJ z3R^&N?CW;4*iU>)F}2h>g<{G!pzi2wpc6dM+ z`{myzPO+-CC7ca$I@AYSxP$`^m5YC1YfK#!fx%l z3mZWWUmv?p1RW9m^9dGq4kN}_kC`f{^d6*oPW-hrURmli>2>lM_Y+Ip09YvS6KseY z$kvRlaogWx>B9_tejgQ%r2M<39{Z(I8_ArO2m+D3-@i`k`s?#d&y_XgP?UI9=Q@;H z>HaWLcHafugc~NgsQb>~C%;OQ^7om=4M-IL8lT7|tKMQ$HSVWhUdC85x4Yf%xSTWM z`)oB<^k_#PnoJmyc|M-Q4M-{zYf9ZeeU`Fa1~FQ!GiUOJJ!XYj4?HC9d=9X ziQOr(l*y{sSHID$=18qa^b7k`IOi#qT8ux8gz{1`wH>rU%-vt3C%bKyJp z?|3nRH$g&i4o`*hr$r67P729krgLhOn7kKUDh5>?P6zAj4Kz(iGceU!ou#6?j@XsC zeB-gn0IIWEmD!EPRUJIqy^!0kMn+RcwYJ3z@!J_sjJ{7=-^6>7v|_wpyM`$WXC3sx z!Q0}fUE4C~_e45Q?R1km9o8Fqo9+PIowTCs`s>v=ZQDsw+KbD{E|i?Fzgr?lgv*DJ zgt&y)a^lVq1+A9;=|0T#g(tVL?Rz@u&KYYmI%+x5Y&m6@({o`^PL!Gkf$8Xbcs94; zv4YcZ*m>4T5A*Z|2o_i5XxHX?=M&;%cukEqJLA`f8&{7(pcr${pJKOt2!uguy*b)K zh0xVB(jL%Z&q4BY8%Y}BO4+x3S8P5yz`!UlShwh~xRi?~-COL}e6~oba*my+O2{G2 z)s|{tanE(pNMwI#DHnQU%Oh{7{}|jV!(9#%Tx8{TS~n2&hr^K79UEpGzqo?6)M$Ob zBmtWzON`nh*8?o1w$~4KL3HBNHOGW;ghPL2HcUzW*MIs3Z)FU3b^pK>K{5GFLB$s0ptHm?5FKqIa_&{_AR-haG3Lo+80gU(>P!}0rCKO@t- zwgud*aB^u`GB(}hQkB^-*YRStHS@DI!|GdXTobMQUW&!5vDJR%(V#}=NG=^4%+aQq zZscGM+9q+(o&$@V@?oYnI32gQe~A71sSnd|{gKjI zY<1+Ecg2I#0S$J;fylKPEX!D}Fm9uLYw6sz;8dDHU(agsZf}~~`zhbz(I9=GvaBs4 z#O^|Ja-j6WGQh9qplqS6l~^iWz{L(ypCE6G~7TLRO+jIY_A%Zq#=}W)6_&)qh?lotWXnYnv5; zo2T2B8m=v^m91t&O=Ez*clYdCY`AqUP4+{cHA-!}M`p}+KJ>u!q+>phrgmql<#uhg zvXF$<`(NycF&Ky_*gQQs`#Gi%ShACfp@uq_D-j8t0U*%pW z>}f8+q 
z!T8T_`;F;@1$HpbvthP0WBSAPHMuP;l=#H`_SiYZXxW(aK@J&miWm*?jH_~Mps4Uh z_d7RBV?VEcigaJbSQal@X&p78UAyjRYQt)c7;HeuDp8S6;EV!0*s}3-2%D)$!e1D!PmTK4{+5Jtzh*a-z;G z&u;ZvR&k<^<|uVtyX_Wwn1{9@@r_P6+DtooZuyR+x4^JfFP5_GHQkQo)?)wcAo|EY z4+9sCc~$M3<8ocJ`z!>w%}CDj>9ZRk(&iiL)Sic{wj3J24^$>7vj|U8=gx`OWuP1S zY!0`MUdMp&Znz56j}0#Vy?5Yb*9CcD6>FZ_|IR)`2zp-V78tct7eWF2swlj1um8AlW@n7D26I&yA!=(m|^TdSgSH}c~bS+(HVyZYLqmc@q z+kB)*s}g_7IOzcQ$;PvZK5f-#oG=}A(J8&qfV7Wi z1(i(5jJOpx;<~ z{$7bI^6%|`=hku;WbDwa)Si3^_qD&@jUAm`^x#R@>?sK%jIgcr>z2!HkbNDinxhr% z^yAG{E=nyc@dnZ_X*AO|3dV;&`lB>AWh3_R+yUs6)K!K&^Wp=Rit$6P-mZd%!MxD@ zlsUlW!CyJ%Tvs~UQz~ZBvS_`seWCx5<{Yvoaqx;S2M_FxN2}-bbGc~5#jOnxsV$Uk$_I@{YBkq}ACr(nYrM6XKT7@AYX+t_SlIW^DeE}fQ zlvrBa`ue;7VMh0e*=|=y?CTB9k{eIv2!bHgQp`O; zqj!`F-BsQ17(^#@-NCdwfS<@|>{6ZJTEI=j0fI1Lielcz1ogUppP6k@|01!xw#ltazt4!`X*}baTzWyf^zJQV6HHC@FT!tpV0saKtd6nllUpM5 z*pWOgJd$m3D_kvA0QoYbuEx)aA%2jfzcL=kQb)NdiM}}D8~gpM-$_5Gso7{y3|!gK z$~zsUd#4btuKoKeKLjLm?nJr>Ys-ggMs1!EOI#&b4{%ks zK?6`mU^$UKY^C4(@t!}|qC}&cz8?j3Y=7S!==$@dpbD z(<*_=!NK8oGLP4z%ul3G2mI`m!wFq~rICf!5p6h%4M5nWfy9s3IY^oow#?}cv=&KZ zSpHOjCifWIvHb~d+v-2D!`06R3TisP$@SM*1}?F*(B!hQWttd&uNHe>Jdv{lSnTpg zQLK*lI(n*{)Pk}kDcHp$Gc za^aprzggvF-j1Z%SFns*YPXdu>S`M3(KA2{Nzs=^B&I+?bxTLFiQ(beqz@S%I0zYZ zaiyNMlz*W;c=2$^`n;<`zG31To6cbN zthDIfd+iHtnQj=`LAT^$Qh_2_X;s*A~ZsEa=3%k71M z>{t)Wi6@zhUE-&XChh%Ar2?pD=5ESi`-@dQ{HrZh70BK8w|OU`-ie#rE>-RqCc4-Z z8;yXKfsdQ1y%z;_v4+hw4KBWk6r-1T#JdL9pE!peCiY1bG~FK7y8}aH+qa9mx@re) z=Q`Tw*~ftGxwZjSM*|&&MjF5Fm~FY`7gQGFCu+d+Gkjvw9Muzh67f}?h5Kb2<8Eo@ z!OuS9a&)kJq&HTvl%qNJ0Jvm}n-8zlAlsvUJhKZ0ycnMEL}s|0w5R1t_k#n}YUM+lzI_oO74TCZACB zTvasPas+j+)1dC*SbF$Bag&Hivew1=DkMHfCHy=p8#Mx}6J<&E3c_2;11T@aoC|Y? z<-RU?mvpbCslO@scpdN0#BF<9lMHB=(wjQ{`1VHSTIAyd~;&4Ip9wd1r|2a`0 zf7YA+DAWJizCSvtoW4n#VW$F@ganoC_R-jGWrR1`;sqSR7K-s>aCB)-m(83{99c}2 zCj0d<)I#w`&!YuyQ!&b_G>(s_z@|B6k(LyYn~#=N~RI^49xzL>#+LoZnl zCr9G5nUM+1~DToid?Y|Lf6m}ntCMk&wIOe!KTI@YZaly0#l`i##EKIKl4xdu-WGZHb##ancCb3Is+^j7JFv*~=DyOx+_9bo`TqPtY}>bvFdM$=928)$ggcd`g_ zP0xV08J6c_aM$f~I*vkhBF;EjB16?V*=?Qf+l7kGf~DAjcgJ~cOA}X%+FkS~;y_7~ zWSdRUWm1oF!Vz=-g_DAdVjhjvZDDf@&Aw31?167_(N{1cav z2l(@UtZx7+54jmagjA_YjK#Pl_Dh>^okSV6#}H8Fazt!S>MBW+Z6@Oe03;RzCffBw zwKBnz7|QdLRwu}bS%)TlD%XH~Y0rgr4!v-Gv4~#{qeL*T>>_>Z(@1{-=2dV-;$zc* z-vk-W*eGUj$)~3ZQinQgut?xSpt582!5I=M3ymVw8MD&U7N>)Kb9DE6XTntU1I4EB z7LPh;&19WjupS(2o*x>{jiEx+W32JyB(1%B@fxaxlO?I04JVm*c=Cz)< zvsm>1zv12MJa=IB4nz#op5Q#RP=&@!1`Y z$csr5KO~yInnrun(+x%p#Q*Qx`sG=DZy? 
z-MG%LnCE0ZC3`jAF2w2JSK}d_W!ZPaWbx{a=#RJoki^Dd4rcj}k^l(cGI?~C-bynM`~GvlV6@}nlBoF@{=F~L!+=u?51G9)L!NjPAsNnt z1Q9c@yM#P^7Go%Pux=tO*Mxdab->QKqie(x!W%JXmOo{{u`Jwc`AU|Xb#3YBu0LYi{(OdeCrq8>fUC~EoG<0%fM9u4E zozKZw(ECX-_}nKu;?2vz`)ltjm(8>cy4Yu2e4NwxuhS{nIBy#A5VHv%zat0H(ye;% zF;cRHHfL)4jcIwLMIB6mY}9tg1?jB6ok0j}9^w{M76}2*@+Y42>54e|5HAZd4=s}V z1SQ!;!&VP=H+;+;#GNYOnmkzQ5hSD?@NTW?o>j)C5`aatQIlJl-kP|nzrhR`!9)_wtP#5;|go0k-2|4a~jIJHwTUdho& zCzON#kMxj?nV94WYwQ|ERH*SiFR|(!P?U)~K78M91fqZ=&*AnMUVn>qQOA{${sW*| z_>O-u9G@xjd_8`v;Bod{#KFExgW%zL#wN$dts#hY=+P?JMI@C2F1s$wGt@aNSr;Tp zCftXB%tH}tUp}j39^~IyF1xf#%Mb4=*l5)~;!4ntEh57WV6o1(mB3#PV8ab~d7fw( z6#~@IsyIH6773F3Be6B{G2U3D@`IR-FiFonO;VNLtXl!WS(>h{ujh+tn|KvVG}KUI zL?#4iZ;AAwhi77y<{eLH-4;U-r@qJ^`UJ8)$=F-peHb&BF=yHCE;HDaou8Z zqPh~X@A8&w@yj_x(yakE*eve;q0mbLMJ-31)t6IJ<2J+hje$#VCHs{A%D3XvTT_x} z*izeonWVZl^NE$cXR}Ia@>^uO1qN?`j154K0SV#J0q=p*788|*_Jv=XY60*@i%yO$ z{)H5&qb#v8j5Hp#sviP4DZ1co*Nn6aV3{EX-lv$XVQNm1oS;6Jc*Mu@%>kJ+9=tas zsG(M+CENgk{1?BTc~qSu;2J40*X;3I*w#vk^MG@01WeE@KuuekHp|eYW&~EPgKO$n{QX%r1(-U$T9ZDRj-L9TEvZ6WUtF*KHmT3 zhko1>mqv@{ER)DFGxp}>mp=o;T87B_RUzfH9OWd|xwrlHWdS@O2f_#hj4OckTri^b zLb8F&!8@K(mniFR6<@7oI2vK&wy9G0^lUIX9buy2i9r{u8BJe=y?~Mdu7zYFk9`5Y z?*8l%%5Q%EJI$$_*D^~4=>|ez6 zKPDxC+?S6x0fuB#C>jP13(iMSmYdUVb7c^7dT2ic(VzevFrwS+##@}Mif?u}p7+$J z96-Pfbn#=WX<&&>RFHlqJ0yQ2%(c5`#28m#k$QB#@Y=Wm3HaXoa%?^jFZO;|ZqSC`YoZ&=5TWgc&D~Rl4LUrj791tu2Uu+&05xffDa;{ zR@4DzyJbmsG5ic;;v=f(9gpdO#hkHbR)Ghsg%9VQZOgn~OPTw5-FE#l!M`{um-K9<50aV*dHTui?slm2Um8 z2XqWf*>{^1l;D4O?)keRcW)|i{HO{l=`e9mP8~xr=H|Dj6g}9a+TZ-0f+e1TVN>Hi zvz3GSN`Le0a;N}&dfM|TFY-*6;SrdHU;esd6VA6Dc)emb&1H#CZx5HG|dB=EiZ9ry&{=~D`s zQqB^wCR)QwA7Vjq6~V|GB9?f72l13wyTb0;@tfNx@C=b;2REGQpG$-h=9Ku^RN_mb zvaLw*m2;0-4Nky`2ffdg;(MC0lt#xt(O@)W1k%8qc7Mh}d*-ezb?#&5lR(V+{zdgz zZLp($paw$Lc55)(pP@3c!x8zE!nCv@IJP&Mt;ye;p>&nOn}T{vouSMGCMy@V?D}4a z^Em=Dv1Ek=ky)rAOn!7P2Z|XIoUgqoXos5r1m0&!^-?k~$?jiBJYp#}XrlZxky}?m zQgImW6q9p1=V)Z(<%8t7OW4l&KG64T#`yv1`}mnH5DsBL5+(71&= z{>Uv!^7OS(9sfnViK8A`vD?hcTf{zi%bG0HcExm1ZSjtvZc0b^^hQ%s$c({~c8={V zOv5m%wQ2qt8_Rjr;u==4pm2*LQeR}N^pa`GZ_u~z0}i$y6R94uMrm_6ZjoMc z!AAH&?&G;0rw7)$5{D^A6{08aXQ4@i2dy?bxPFh^3r}Z*p9TNym75^;d8ZbB=+XUf zcuOu?y0>Yr5POfYNwQ{cot|hm)DFc!7iziZAUcyB+)R%NE9a-U8>*nv#CzdcS{3h% z#&V5Z4dWV)i&>6S&2Fc?tK`z@W~h($F${f}HB$x2E!olaZ1k(Xdx7G}QIW|J)AwNb zQ9^0MI~ECoiGkQM=b4;VY|S@e7K|Z3!!C!CM+VwL+c+zye5*8Rj@lyTkM!s~OCs*u zC~}#1VmYD*6#}Sb!pjyemna(o*!XiSs^4U)ImEm4F4CXVPCathP1`I~X)>se<+_^q ztJ+S|XmM(9(+7c` zqdeRK<(IGicbI^+@5Qfbw3id|tp0sisX?mI&3Z;i`7tatbFskR2jbs87lv$ta)bMn zupb~zkP?n`$II2m4UkCHYJ^fTzqxtlL1zBT>-ITJ3DQ!pc9;)isM+?PmCkWxhWW4S zX4o?C-mz&?p%{rUeXs`<6aUhmDDnDS(^hHTV~@44vsuc_6@ z7K=oxr)b&Hg$t^MbYAh(GE6crPyM8;l;g@@R1<7!!Yce{3tf~H^U?UPVEiAyeS;`U zcP?M{`L{>1gfy9v#gA zy%h}@KqH7-xM$%-beegc$U6}@$|mu8;bOG~%f%QfZX^8BqOJ;iQxMNx>`jt9iO9SV zi*&!hL5FM);7QxTHqBeV(qY8}Uij5KSy*W`{a!i-EI#)$?T$FKE22Q+*JuVb_^O2y zC&qax8$m{Afkr(=XA8QnSa>7MWf?5!nYOW-iK$gc($N4HwsUXlkG4c zE#X}mEyZ>^HC^%>l5>wn-UKiU85-%pOyN(!F6)%AyMOLu}cIei?iPF`##AmlJ$)v|{P8 zq$`>CMIDvCto$5s;e=?&9QNcjj1W20Kbe}BAbHcxKU67%D#PbBqvOo^52~l4({Bqs*nm_ z0J-FWl5<@a3!P~IFy&FkpG^@}XUW62W53mZN;e8onSkoj zpB~nJT6Qj@{d+=W5^KR(pK4zbc@+ZIHq&oVNA`8x{tI?)E*P)NfP=6dR)gB0N|i3U zBVOpa`QVAgfcii75+SJA!zbypcs+XX#}iucJH>;qQwH6pY#SO|i^lQA;O z_HO5bdky$y9`pu*guT632CK_BTgu&_Igegp=r(i|QD%a0$aomG7nz~x8FojEQO2)TDVtuhhEzfxDfmH6e~ z#QC3#ORNGawNI6<<$`}E=gDn^!)S7L1qfLbDD>WphGh;|TxGIGtV9 zKF%(X3S0bhCfD-Kh6^Tt*85XFpOKqmtN-Nb@3Imkdi27q0BNe2sax8+Hq=S8ad{|J zEHFoOPoa*^befQ70{VRQs!Nu<%IJA2d&kOcCB>5-6Y?xO9qlxV67DLRo{2raz@`d% z$q+8U?X>c=cQ+fbd45KOe_bRuL#EkB-<&$eW!#ovf5_AKtm1fRbTRT)%!8?|u4Az2 
zXgXMRM4u+m+S^*9j4K^_g>q9Csa|dLt|~KJC`rs`=dp7i9;#ydc_IFz{gsOGr4shr zI^v|J+hAI3@LPIOo@sG6?b))qhG5!Yy+$yZi*QV(egCTE9C6-|8=|Y6))uX-pdgrD zc4(I!|I#q)^IKx{&!%INOyt4zgj%ytUWkP175oXgpRO@?2OqUYmQ26a?J<-0A8`WW`(nOY^h#O@Iq6#%*u@k3 zw#5l>4>i(|eOV2$?#$K)(b<$9#6a!OADbK>pb`$K52a=QYNf&0l@Q5wm+7{B3pxw#_>Wj-5i9E=B*7a@Vmu>Efs#8T~f4ZtbHWnD+1o3x8sg zr-I$%ya(CYW=ARx*4j_*1x@|d#r`uheW_v#H7SF$TcM_ZpUCt?F<`>TE^ql(#b2lD zQgpBKYT+(#*j1c(zmw$fq0qfT*(_6Y}0j5((LR`E{E(=1tteVrY3!^!dc8q+sY@Ag$k zoDAzcmJ>C$QQ6vUTRHkx-NA>@nw{sPPeG-&cR5*qe|vCC8=dG6=+xRMOs!gqNmhdS z)b`5AWVgRHL;YJNLa+044#<3bV@J0oHWWVYB5~|X z#z0Rym-_tNYty8pft!nOAJ*NCrEE8(iGN$H<}B&^*QpMDP1bVukWE`kHH%$*p0FAr zoC+GEUcSHOC4a6RnII^tH^Z3KynnB|*bjm+PhSnGvbqz(=_+csW=K)2Wbl3>BKjPY zJQK&-#Yf3f1f!dtrD35$4X@U}*PP0=#$WcCDXR-e#wDrDjnA}~9-FSI&RNQuh48UC zsU=QStbw#C4%Du6TO^G6EpEq*mzy63FYUn(S{|>MPfZrgxW(~6AcMDKeWuli$(Dep z$VWr=Ze^>@32F;9r+2QrNu|Ge)Oe{SnfH|My}usnoi?&WgbIINC_?@!N6E`|KCF z>eFoV4x05?gC}R}Mf&;$g$;@mcsXZ2@pp|g23CtW2BYFPK*rW7XnqZkTF{aADu3d4 zYUxN7Qpmq_STpNIl!3GB-WcvQ5tuBoIAxgll7X5S^({U%441?uySrS2y&W19S2v@d ztt7x3CAvDp7E3E$^@tL1nd|h8Ke20pBgv6zCfL0?^3izxpH`6Ht7M{Fg8(N`4WupF363LpysLjRJx!^>{(E+q|*{4 zHzRrs@q=`zJI;YN9vxQ`lf07Q{YrK=44*T3{c|uYHV?Fgj30N+L$VV@|DpM6;&;+d zAHu~>shvO9)wRP&yjOlawmM2}n)Gx@$1cY?O;2Y`Ijeeytu`xb(FAJ;$-BG@o)_l*R_XGEHc~%mUe=j-2oB^Z!1TgK{zi)d^J%*a4 zx@4MQMOotgn?3Q;C#XRxjLJ(O3=Wik(Cd~b;Bn;Y5n1Nx%NGZdCvj&!#>6cP`pxpb zP#QbyPUZy2xavP}a3m(t@$&i5KhA}oQPGZS7Mdy$FiTs>L35jVUbrt|iz-_P;@`o? zP@iRYg4){0n?KB9OwxWO*ZMZnq<*YOa>G(Md7QEwHvs8txg9=M;kZUmn>K~K2T#vM zfZ{V%SDv_ti*~noU9wNVDf-zmugF{WbM~ew9H}3V`xd{AH=5rIQvglHO_!?K&tD(@ z&yoY?B@e*a!VR??;@{V?*InXiViap30OY5I!}@%hQDR@+lC`G9%e2oKPd*j?Rzk*S z&#|w^qXu7x6EKzJVWGy)N7CMUYk|ZDOY7j&l_)Nt5XqK!MtJg5s625`2|GQHk(aRF zEOjvG;xR$=i)1p{IeyDEDek@1w74YL%RDST?UfY*HfdV8BrC}$CoS?G+CQ&0xsriGyak-zWVnN1k4t_rpjIZ3(jftFJ2$YVr!= zP)k4rqId!VI<<%#o&g1G3FkCMuGSKhK%hjq0zpK=5ELngiimg=P!dOuX`oTVDI_pK zX)%Bu1my|_3T;G=Dp*9K4bkpL5Cd(=e{bf?yldaSx4XaJu1TlvKS0Lok0b5+Y-4n3z>Q? 
zDS|KB?-?ql*Zwil^+Sxd-bnCumESth4473`v$@0-5=j2Ok!%DJEADoG&ymc$__B$}b>{V1+-9!iwG~u0d zV1OG`@Yk3yctSw_c(wFr5liv8CJM3^*_ zZgFsDKA$;EIrD(n=iP%Fxl=DP{d_Na$R*R}*rNbowAl7ich3h^(Vx)L&R6AVa&a> z`Xfj8!{tPQK+r^B;AZ9OtM1I4jbF@x$5z;AG<}h8nh<9!<-qd~U_e#n_`!qO3Gp&- z_mdAZX0J<|jisUOX5S2#Qpw?EGgxVN-(X)e;7Go6Dk7P-Hy9Pywc&%m$VnR9vwT+5 zCyI^UOf{>IS}>OjYU;Y0 z$#ZV5d#m}tB5E-RX8+E!2#M z_;11%uuut;TQqgn1th$e_piOKw^a2Wg%z6bKz_RX4V;B(&A*TwztPVhy&0{FkmTaG z2|JLY+8N{Q(tS!!_#q)P#?ej!s2fBbpMM+*;eAI~f;<8x?mkj>A*iSPh(>Te0CS&# zi^~>pSLSxh)r4VX@Q-4g7+st2Qrsf!bPXy{NJ;K2XsuaA<7FrHcvsQiN*Smb8)l@1f2Ou5=dNu*l3e;_!NXl~?G(UN;?t^Xatgboe4mcrOTclx2Nrv6-i6Pdn`0!K<6x R8Gl|4emGYT7oJl<(m#&Dd}ROt literal 0 HcmV?d00001 diff --git a/docs/source/assets/kernel/query.png b/docs/source/assets/kernel/query.png new file mode 100644 index 0000000000000000000000000000000000000000..e2d15ebbfe26ec00d2d57581a8709f9f2ba69369 GIT binary patch literal 32710 zcmeFYcUV)~_AX3EKtKgVIsyWM6sgitq)UVl6;GOw(fI(_dDnRd!L)hM4gc410#fUL5xK1-c1=?002OGKDE|cwP%67PNKa*s6HiF5=3TO*X!~8I1EK z+w;au16&e0n{yh7d>IikpkHmXMB{MY)qV zR4)l1r-rhgfxq`d?7rehI-xrGk1Vks2LzNc9zP%02rkKfSTy@MhRdpb8fYwpL(M+g z-%Z1j>%ra1J)`!R0cNAJjR~tAzGNanvi-r`zB6esW;>>~q*SA&|0#i27)< zuZExaG}Z(#a?mBPx>9qXP&kPs_uZN=?s~#QGO+X{y0T}gKg$MHZ<2MQJ0%pEZ;5uj za(2DoAUkXQ8rif_`-oFQb$dYfd)K0(W)1g@fOWiG>SCmsst~_J{8>$L%RwFYTN%ag zy59%XBcfB6%{-3ERQolfu4Eo8@@xzFHRc3=;P~ReQwg`;p_@XK8}{n<_Nhi()ho-8 z(}$~kON~rrDlO9BK0u})MChEVdj(QAxXwo}l^c?N*Zdw8SK)HigLOWt>X$b{k={w4%RjT@ z*noV0s$P`wmgY;zc8GJvR;cEXCz+5$5p|llP9|QD*qV`H;&>n;Yo^a zdBRCDidSt5+>Ur$Uya>y&2T5ay8Oa93JLy&cQy3n8TBhx>Bn!^-xK8#(gTlodL$b3pD9gXuQTJU<%cTy{M zGY;wq<@jrKHH>#3tG(yRqi}jC`958R^5DkVyRTO3w{CWjF2$03e{4mNa>@k=Uk)jrOZnt=7dxE@gMd$$Q#5tP`rqstG4=YzI5zA4?yLHL#$V!`tqA6g;3c zW96qvjx>E`8aESD+FAdz**&PyN}9Pc6#6}2L18~IkfId_P6fUKCIaIl;;{0pR7Y2i z3BZq}A3ieD;J!V2O_R#?TIO}7i0;QWISltda&z!=y0Y!EH(aZ{j*7_TpozcR^_p5$ z?M^vcV*IAcnkuELTbFGO?o^_5`aRA{j^y~b_X6<&T`YG?sL4o^g_79{o%0y8>U4IW z&g&HGQI&{`W~M33sgxIF=E@cXYv=G9^X`9qqsIHOJCQ!omHTtfb*Byy3Kv;wO4yPgi*k6O(z9c_G@Zxzc5i46AiX^t-f;D|K=`Rqf#L;U$ye^2tkb$ zX?sa8szdJD^fRZlevy8Wp*o+D&4RfUv7Czh-Ew{%0li8?csWM(b^c=Ehk|OwT-Buy zoJu2)yc$*?^~t2IiCR9tUpHG%VDi>PxqkOW+w{%p#cB1rljoLW%;Q62B!eF|w?`!k z4|S4@x60E?U>YZNyIIWU1G-veP>sFld9Nh;s5Ui+#F}Yih4bn2&~s0;8d?+m6kqo> z_vFM>IdM+<+8Etv&@IrdxRjK%?OK1q+r+Yg0y7?-Mza{l2S@+B(nU7SD zm=~%S7;3<66svz4t(c6NAWekJ70L`v#f{-L_iG}cQ`RF^hhyViVspNb9egQ$7%Obd zm(SO2n{S)&c;y(%SVV!L;AKo>vXC;7jAT4{Fz=Hi)#q)pHM)A#KGkl#YPKf5ZoM48 zP}8$Qk^Ys!rqaREyS@3mvBF(u<9w)Zim!90XvT8aVAp9^bQfRviNibl?ew8FtA(#8 zMMvxVwtk#1G0Rg+`FrgrB?taLUDo-I=J(LiHvOtAj6Yirs*xYgL{6jK=Nfw&%Uj>M zg}7zA6+%iZKU$_DY{2+fTdWB-8>cYj4sLnqU2PL1ZL z!@pv$_B| zmU5R)Iw=SC(`$n?v1Wx$gkG?<)P7Tuz2U&VvyDS7zAU~4fd@$;8t+t#R8M;;R}A;V zsr({fH^K?bzEXT8%Z9qAyK67s=D8`OexSmmNw0FOGM^@$@-tB<{y?XfCON{gkf^{Q zFE;OO0h0+%FfK;f{Wd{`6z# z4woe7-T9yOE6=LcJS;pAsRHT-r{M8iH(n0h)vn_{S z#%fbLo{)oc^J&=l4*eA0Y>-XdNoF%?Mr2N;mRtn5*_2Lu%E(SfzlIhn4%!TY71uhIc}+xQQMvH#@{z+$u>K2q zn63le=Y?V!oEiOc=$$9?sfz|qTRpX14yfsybutfV19lBi+sFNfU)KaIe!<-7)?nmvvS_WH=6 zFQFB)nLe;msS?y~Mia=cN}8&T~ebqb;2Gt51$bm+&+KiKD4b zwWNnYaZ$oJ0j9X`%t+*~lz?)k37ACOa5|=QRNQc+9&mTqVx{Iw<0AOa%Lo0h?=WVe zd`%5rfqy(^!J5xuN9C_0-Oerb>xifw(>rjv?kdKB1a98BeB-L>aQp)LpK#JM z@;1_VB5C91!f$2kW^Kp+(#8F<9~|kIlE9&howpV1OBZKXFUglOY=2!L2^?SkEx^Y5 z*CpOBWY~-}v{@D0JndM;_yzd|*9xMIb9QY-}=IHJ1E-4@Y0)hBJ z!u)QY4g&WjBqRg`g#?6z_<$?;y!>3ftzPoEda?hjli&S3w)3*_baMB0a&u+9?AOZL z&Bt4YjqUPAe_sC@r`=1Zf9~Y!_4i`|4=8YXM&LfbpunHLfu_=z|4M2*y|i;Se(dA| z$PBoLtdNk1^k3KiZ)g6wZ z@dL5g9WQ3!8&CD|L%o-{+jAsGOiSp_eXb;MX@kO8aS{i|I8XV4Q7owo_vR#pa-tgF 
z!$<{2Q}u+&l01D=hKHZDQ97fZdc4)klkr8O^tY{}jI2o_CRee6oIxGJUJkoLC-Vrm z)%g>zegR?7xP!%j7nc`guU9w-Fa0VBF2OZ=k^osER<2`^A^5nR9Jfa~fG5=*KSFUMiQ~alK{?)lC z!Ih#>9BZ-vHe7l6x03&Dq*q3-v1aMuznl7R!v%!0`QJwR|Ag}YC*uFpHHGQ+uXb9? z6an~!kaUu4(0NLJl1T}BY}9M{EIQmEuA@13r;Yq%-`uO$|ExNCYSye_)>~137RTq? zf^PHDhrPlBDOeO7E6m{9m~HIZwBpasuuT>Ei1eOG-H+}135(T!slDg}XusrE3Knw$ zA97H-wyqSKpd~wH7P#MTkGVJ-T7+L5^b}aagD+41x#K1+puTV#b++F#Aj9w8!v{@S zij$jfIT}{44sxsM;#~bkCLm(!Qm^kU$#QL-FO&~DJ1mXbt|i~Y6c=U+p;0GpFJ-`Q zokj{$AHF6!OXw{rt$4oLpTUscK}-;jC(wGDBe7YO+p9n+L)~X$pK8~u%9UXoD|2k* zg*jVGhhMA+*!O>u7`oUC+6!1c->MW3LLJ8x(0Dx7%FmO@SCr3xm#>)l_o%b(;SuO{ z>*y>~{cWq7)fRx^iV-uv%xXGYO)q7J(qD8$)7c~$Sx1!g+@}ilB`f-H*0h%S;-&cW zL5mmV_4F14mJiM8(LZV7f+uqzh~HK@^qVn(Uf}JHT7rI=6Em-GBwTs@2A{hnp%(66 z39VQ;ecJIfMzNHVN++MS$FCGo7lr&F{o8PS)C5f0Mr8EWzo$)J{0d{>udntOr~B0k zOhUHrjH{l7rb_#HI{O>bA6uucDs++@*~_Xl^6c}CEzhyC)84@&0t89~QWL((y2t>>Gome})TW6+$B%}CUR)nT8wc$^&O zWr`vmk0SrwgRs9XjpF6Z9^GIA|81EDeeet|gX*WiSaj}^fxF_v1h>E?hQ&H4B6x<; z{R{4DX25PllHWlWSDzAt3w4mM!0Py$Wv20m*KVPk@{&PY0=ygj_M(#fW{rY@;3#^k zE!AM=x|W_!2T0J(R?5{F@p(Z8=c-U+uRc+m^uxoo@d&CY(*0Zh_+gWJk=D zOk0t>6}My~sypak==G@`Ov;|G=ejdg!ZH4*^REF#mMS6B7q@j-<%5~bXr%8E>wPpN z6mf;4kAKZ^nqYpg$BC?EfGip}#b%ssnEjSNxJz%5Bo4@7rOQB-pO4dwJ-hC-?Is>s ze0;uQ&X@HQHePbrozlH~wgwS*YdtIYaP}n>Ul*oDO8FCvJw<8DBF**Bz`wo{EBP6k zYi+&iLMUDDIGp5yL(ek@K1qxe1sYoT+0|msx6%_0Oo}VodTcBw!h~H@`H!+cj_Bc-u-qGBK*$;C{+J(DTLRng64^jcO4a_zQJ?j zWFo7Zz3 zI{?lt(3xLx)|&`KB4ql8l0RWIfM2Kg&TOvf3%GcM3C zM0nOCB9?39K@+t)%&0OKQ6co0p=HcLiI<{D4epjB9Ya}>(K28*Nie^{<@7!wyk~2w z=3EJ*Zt|5FX;H~u{A&B+i%9Wr9XprojA53M^a^sZ880)g#!_7zRGFu>celxA=jf~a zGjkFI-XOMrkX05do|YWid(^S8!;p}!CjK)@%ETbXA=st{k+Mo(Mic$35VmhC-!2<* z1F#JI#XZeFB6{o)`qG4r6q{I8^9i^2N6=gk6yEy6E^JTJT#I~+f2sncll&Cl)!=~P zyj}n``fxmlFSSBmx8BC5v*KAQeXe=)W=W||Tpu5VahgA3wq&SkswPV8qs@0p@6RXm z0gE?l@OWn2;6dJHq#R9o@?C@33b}q=Hog#=U)qB_sTX@8WJR`w`E}*CZ3i2L5DPDa zi54!O++FR31A~D<*`}pRi+B4i$Hz|5VX@}8QFS3k7AS)SQ3mayrU!hdpUTK2Y4Ztf zIG?0wCud(yD2`}T<#R{^Qr0E?iNYVisRG!m_}y=Mzd>l0Apw8Wa(t216VSa0wPu@n z(N7xbVtfzQK6&x%Zi8@XMCWHJzVpyy;&Bt}h+JfzR&_E5=*LDK~MHvg` zvUd+ko<-9e%sI^UxxvnguPi$h>=YWpL{5Ak_6D|%eC6ht{^KrbK9=xZy1 zCQv;b3YnzX+A$SXvShtZLXeXv`^Dki6f|4RSKkb8`&GcqG1N={zA&+ z3C}-ji7gwL3JgfYx7e9r2b)}?VVwBV$1)5Uw(2lH7X|aAraUPWy2BKVD(9Di4)}+Y zx=torDBG1H0VOU`)-BYG87tpk>g??H=}oy`%|%DeIHW66!R(ZBHSY|Urca9qG5V#9 zXx&yvdk|59ch~>mo0fl#>Q^b~r-l^}ENK?nu(;RLTF7op@UXD)yG+|?L z?@w$~r~ifWebI+ngM(x^U=(uN4~g_$!0q^f-lD5avF0F@6KG)yKTZ!FTiIi?bq6!~&b0*UiZ=y7&AmqahbH0f7HQ z8!t8@+;@`wN_#4-J0&=63H64>R#-swpt;kz7aJSIa(70!0`qv-8@bIZxE_uCo}^x01)AxW za^}p6m!{<$NIVZ*d`qj;$&ZgcHP58Er$1r2**ciPq8B&YSbO;TKB+HkgPunvHHrGu zNius@e8G2-Kc+zz2>*~0)zrTuw?THi;I^6lP8Lfq2nMEk(E);o)_vg8i`UXRjd36A zKyy90c!I)Bv^5mPcy@Wf(+h%5=ET!>#0PohVw{DiRnnD4Q#PWHs8#DlTxS~{r`Y49 zg&G{oYovDSMsj@6A!?;P{7J>1SS}6@EY%>hpE8X+eCatM)BDNb-KMo1yRpe_F1eyb zH!Y+fZ%PhQpmE`qV>-sazGF;3Q61Adhcca&7C-7K;>Jxhm;wgETiG(q_Ocw2@hF-1`K8J~k z)MXw$PAc+u)FU^3`xu55Z-l;Yoaiz%TB1@@U$q$}R z5AP8`qV|pvKYn#_L7wy#Ag-JNmc4*KL2QUplbdC_HB)mR1}ouO_}H-Ht8utjXV~Uf zzYKqKKBzz$!)H>u4Xn0uewa~#XK;~tR=o2s{Syo0E-*=RD|_(4Yx=HpUPGVKk4ZHu{CuS23picW>1{{%%bBu7I(k8@(){Dw=c+Bmoj-hwa6s>7+X=Xf-b^#Y z>H_A~PsZQKoXn0)cC@)S1l@*BxxvxjYm^@)?G+rER9h|$^1;NQWcc^$ed_!qs zq0QUYQhjHO4KuF7u)WrR#)hCr?URMqg`jzo&k+(g`k}a7hdq`UAs=Igt0Hjpj$50b zn7bXP=BHbH&}U!qY607!1aYbAp4^Si8L0V)0+4X76aCqm9M)+Y7Lemq_{lTU7%=U* z1IFx9+f}ct$#)i=!H9;?+1?B(sCE5L!%Y7qm*}^RV-lKL0OlFo8)2X_v8t3TZZ5Wk zXx~2$dQ`1Yy{9h3BX9@xCb9_t9p3=k8P4$hCkqz#0-vXY$eXvJVZVcc0d;Wd-b`MU zGG<`P$9Mv-EU{E(^myFf+(iP!o}a4cI^QPvv?Afy^UNZCSJ+O2(i$m)JIUG%DEm#% 
z)~EGW-CFy;$bL$(M#pU_quvQP2D!W&4gX|69TK@QQEA!k`Q%T4d#y+ztNN5o{g3us zsWX)qSZYZ27(#e-GSQxDf!58L{8`?>qnm^G6Pm#uhi;9FVU9`Xv)CmT?5zOf7VMiy z?3?Iu7SZQ=S*|Yes%OPN5E@9GwO_bseBMBwHusmabQxv`hS8p^ozOw4_&qt`C9_o#TLTU_*F z6Hk05%xR0Hc#`#eX)Oaioaus&b*FXN{3E|t%9SLDovv5j^u50-V8c*{pQtZ%SB@tE zKyEUPKzw0+ralAm43cSDKGxMrwlp;JP@JUs*!+4JaS5rHIZY3mXpS*r^cx52b1(@i z^215-sP`u%G&|yFyKFL|IfWbXs^Fl3m_hWxAMG;QvL?u_-XX4WAczEos3>j>ROW~gpTjVMf%jYuEsG+2u**t2-u-1-=deK$)W=S zi>#BNQ*w+e2GDtiib|!uY=!VJ^GS}w;(=YQ&Rl+P>h+?e(U_Q-aJk;i=DRMY{rC)~|vF(Xi0Q>Oz+CbKeUdFxId`BO#@-eXUjV$aW=M>q$qwdq2 zUw7ioE%3YrguG4na>%z1ykja*JWqp zgpgoOxZ=g57*Vk$L?b5R`;k`+lxN>g8#6uus!BA0w1AUbn zJXPX4#YNn91_WRh?bS{}OpyL$NtSbSn&(oa?u~gNO>mcT(9w``IF^`WE8SER=?Nr2 z7CDcCyhcRz=Jg!v=jskalZJeKEOf5D|Hp0)rr_WvcC;AL|M3~uaYcvo3HdL=gkwrS z%!{g>(;ZqGd@|$Os(oP=0EFH=^fP7G)20aRorpUD8t)?LbA!UeZJYr&{xjC4x)*b7ppc3F`}Erl|PL z8|tP?T4lC(uo1N7C@37w_hKmN$-s-sze}3x1aR)Mfw(y#g-dJX$ zX7i>RXq_w07&6vaR#6gfS6J~XekFgP`SpCh?Q8fdB=Dfi)%QWKsc&_R$o|*XS`K7m z#@mBW$9jn~`SeXYs5?5)Y#*Pspqi^An;9(He#cV|yqQAU!J~6JHBlTg`teu4HMPL9 zyA8CG8!Iv{3bST~cBM*|bTTxE()1`t(Ygl1ClfB}hzZ-EE|bdC4Zm%NzBK*)qxT1V z>^{c%ACCzlGh2-t-L3Ay{I+cve(ZmhlY8Jc#zb573nt`ss85+NM^UmRB-6f@q{>jD z=_upVw!%e7kTB=jgxrxbw_bap0X(GVkk)aKVk&Zc4{zATa3x{9JOi%L@lQUKhw$ay z&!2NR{zNB=)K%o(t6Xwtsz+?L$;vY+_Reh?Q-OzlF+0cY7syJmXv4#)$qwVgxxL^C zzMB*->}Ayt$!ri_)&|CQiN6RtVp^`4Zfy~E_)k?^)RZTGdBK7>>j(9A^H!Q2zC~8| zE}v#C)sPP;3B(xG%$d^7$QUb_4o79M_^;o#VYuH1WN;W)VcFy&)m0W2d;#Rh_(KdD zmCkapKZ3jMt@;DJ}PPQjMbA#;WJwaiI z{;LD2V$Y~E1oT19tejp{$9I`*5QVnRg5#5{km|3r!q2|o)B*;~z9QjMwpT{K*E#s% z8sm=FPW^+}65iG58ziv$3_8Vl7tf835H zuFr`V8acs$!VuV4YP*Tex+~Q73iaHYL9|}xVjt*<>&OAFpzqyv_FiZ!vfS6caA&4r zk3f*Yt;J&u&F#M2{H&yzRror0b;zq+QpRb}YD#HkYx8&IN8|r!=(Y8Gz@0PnGa1Q$ z-&)%wlq>TkTcEO1IewMER|{F)UB7Xhbm3}l`0Mt#yUq?~EQIk_9<1m*6{lmF=qoB(;Q0b3Z`WMj2+5vLn)0({$ z|1OmOdCm4^-rVBVGRyBd`3EHZIhqF4B7f)m_Wx-6zx?dJyk`533g+L$@TY^{N&5dc zp#MuH^!bK%s)&;gKlJNsV(}J>%%GL2IH7S}6wg8M^83!p!dTU_7h)@2rN_qJ$PzkYp_zCI$F%{7aMgB zYve?JG#5sWm#1@8nm~Wpm0zq#RIst=W-i@TkhmM?qW+LF{{y=+(J^b$ax8 zTFdegN1gctM$25((800# zmlq71dR*I%fJm=>(sYq44OTjPUUBxk?z4>_gEKlZvk;d{D}RBkzL9^qdTUgG8ktO% z_S``QE3LoxGV{>5URp0*8J&+YC}ly88-trN%iv=N{nEcp1gl2KmSqKaWbQ;f$Ut zeJd9N0n;xtspK=&zS5SoSQfn{qa_k(`Sr+sL_?ggEu50lz}#o7V2cSUj+SE%QaMNUzNeej?SVPodv8hQx%pF7wn>aoplJZPh6yLXS;E12h!RUpP8Eafb-n>UJ zJb!{W7`~6{!%}!)G_aXJxV3W7&eaa318~1?wkFNi8^{J7zN{M5E{ymRq9^t1UcV5q zr+-$rAALPz7VR=B+z-AIb^maZW}hUn+NP5TCL_BHEKQ`QWerg#WaGbcD5P}jDRB}0DDI{_bR0JrK30Kpns`xP z;DLrs*3TaowfrSQxAMK5bjTW`7Fwgy(r-G=Hns4o*_o)&^2JQ%p&Ps1`+DaQfqaOh zPsx<`h=z-~PzRCWx5XRk_pF@p8_rh6;dH^i*l2pu?#b>2SuyEem}_S8*7B6$hZFW~ zO{oH<*tH%3xJYH|dC3L>4mR2gBWJKbN=_<*JMnCc(BH*}zLiE$mCpbx=x1;dyyDoF z@^G{?ljp^z8+r|~eZ!vOw_|zF|78#Aom^LJhUGVKI&mWg1U=SzTIZV_I%ArNcAU#R zub}pj8?`hHs0sJm)YnQBMT*1FU)u$o-*dp+w?2&whp6!i={d9-Ukwc-q#k6D@%OIF zjCstU!G2qt2zS$sPAx*(o&*rHZOWgBj&(2s&kXBpUF|0x(&SSGcvGP1c>0o$dQ zxi%bmN{E}NSth}#`!*z}6F?B=?cS&&iwY@99E|Xcu<=bXOP-68gWml`MIn z*PBIxKh)arMiF6MM%nUs#>kU+b^iO|Ouj}bx_CTvL8K+`uW>g6 z-K^hvj41&IS-K%e`NxxGjSw_qcIs~IR4ip+>wB~!JTFq)=M!U3SCCJVKL~wGUM9r_dW;jB0)hK>>G@n1o2l#Wmu5)$y{Ir{ge}jTAB!id*j$26@tbXXy zixG@=gUrR z;bmjgD&v*_`KQb1Hzp~Q7s~p!CtjjxOb|Z-?-oZw@8v7Y7Q;tFg~;;85EkS`c}Ruv z$g7D>?|%gN5rk1NJqK!0*Lnnz;vHYZ@yGxHn?0~CFOY}vJ3ik{A^eSC>hS91J~piX zXrx&lIBlHP^#guQ&@OA^aEbAZ@ zpwpS!_AEO$7S6NHYn*BSQq<6XpsQ@Jo(6{4=<^x!{Mn{X74*(*%}yBm3q$mstYPx{ zaGSDg3lvz=8=N0iLe5QdzOj|k;5E1DfrEz?DgM+Ef8201e62E$Nr{!`C7ea>eUk9% zZS+|Brs1XF!25Q}K#H&kzB6h!vvboOc~UzdE9{WrW6%gQ_yi^?s)H?02Hz%ZWEHTj zeqSsJTQSRKYXy#3OV1h5{&$fzF7zNFU?-ybx=F*FzXym`)oC_ 
zDtc!!XD1S$J!_bzz4u{*X$aYYFIg?x7g*^t#XSxGJa8oE7o^*e+MA2z>sb#a;?g)hNfYVn&6Jrr+L{pfPAC1+{lPK|K{spG`4-n^W#!kGL%=gC z3fw*j|L0lMa4W#3#a#{l3eNy8kW$RBTAkdH5=qs7JzyyM)~j9J$f$X;*rGQ^3e%Aj zmT;)~q}(5s_|whr8sQbzYb4jSBe?%9JMusAKd?B50TJiH1Zn=*=-&(W-(r+^fHWRc zNB_yG{4?wS9Blw$%dmM$q`U&&?%OcHm$uM}IBF|w)d=E!=95S7?A&;U?Yq)=zUB5h z+4G$KpE|*n|J{6B8-i*1Ol*VCM<8M0Hwhk&gim(5<`tL!(m5~a2_Zjv^`+=(Ztwa~ zZrig<@Z^7+c>6}^1)!1_zGYJ=yb8>@gwuh}cLep)KjXVXtp;fKBO2mVCv9MT0Q4u0;D2CNd-x)!mX1+xXXLxW;b8=DXsayk*PM5vL=&d1+s=zOnL>lL4}; zRFQ9=cm@<9{nZu@2js+)ErXlr#9cp~0c?m~pfu!qJnJ=(IW2?wD*f)m0us=T{-&rCFQOEvNftj`?Gsk#tJf2Wx&AM-n+BLksPV(`Jq=)OxNX+%*%MCd1Ex#FROpVm zWsIS)eXpRF^!^(;T?NCfx(~XA}OMl%KW` zc`7gDwy+1rwf1kZg_Tv`NbNIx#_t2Pv!qLU($@4-C^T;DQ4VJOhIQ}^8dbn#unyLJ0m{?b9mC}obctCnJ=3`S+p{gaNlF|Wc|f*V(3omfg$skOl`7?7(s0L1$o z$azu{Oqh9TxOXyv-mIB3aJW5C7ZapwfQq(Qe>HS8ME9ZJbmyriL}TXt)|hI$c}gb_ z6Yoc!zmMc|2bg*38K8Ly;RJwc7}o@lu!Pq@-O)e+AdQ5I^Rjro8)RWW>pcQo0;zw0WU(09j_#G{XqnPwYDh5FOJ z;VL)pjR00Vlsa^)0@i4SGuR^d#&7iWjM!dY9>0PCdp(h~awL!T_4Ly>pPOI><)JPww->`TiVJXvhoOlH=T zj;n8}brWy8k*eWx7Q1et!D8G-9~c<4MUu@tBdW#O412w(Ml-QM`Zq0tiYshJTR=(D zVIV_lf^YbGTf-QlB{a`h@#Vp4IPy7J;OSy`HJ9!-?VYNxZ=R*QG(;#K0pzyTX$Ic2 z8x2uKllNz8A9uwS)z(&WE$S&5gXfOlkRqXGNHkzPuX<0^T7rE#=YdtixGue|g#ao} zJwEOdo;^qf3K4^cIH5^oi~-x#)rXb+?Z@E`Jtu8M*X6TB@yz7vI)+^G0U0#oPs2O( z1#TU8R8ERym;!nG7tomsCLOS;{^hO_u}F4jGOY*Ay{v7=!d43h)Qzl8yE2EB0KCn?xD{~QEqIcjDe>7os;4gGn(wY zrm@^rjA9ISy@`LIiA@mXlnI5Tb3*7QI-X+Pq^PDrOl=@mRu>0nJN0# zeYVp>2$pdY#{U`s_)6(dmr^lu8kng<`CL1He~whzAOl(R8c_e+IiE=`J^TzQP3isb z3jn9_O5uu(;pWSFNT1luJ>T6izv&gX6Syj+A+^RRiJD~!*;0)uqJm?eWWHr=** zG$#loDY>f-Ms>B?eV;+j;Yg(_f~Sgn7RyZy`3Yb_YXl`0YXHzB zXq|RT-@5`nC`VrI%dF)zWr=^1hhnYcP|iZCLng^3xAM~0g$6$sjjyW&C{A}SaLZ*6 z>c~;Ac1zNK?J+m4B$-O#W;jr5!82q%_izAPQBxiSYjLrVW#Ov&OIfPX>8 z?_`V>FDF3y;NF3Xj8AQ*-R%8iZ*=`!_hZ_7g_-^(Jxb2&C+HWjx_5nBb(C|y=s~BI z&V6d-1;86n9s}g5^^2d_TL+y!4psCummI`iyPYW0fUUp}$h^bc^33my)*PLo=?mHJG(DTUr zc9sq}V%7ESR)N6B;Ibw%TyPP~1JWw)TZ@n~wB2Za669#u%IEN>z)}a&R;7HmK^CY5 z$NIy>D;HUJDOM3%Art^1Z|WS@rh*jb??W1YL1+Id4x1Pv_5Ef)4$k zz8r5bnNKvm_+lR}GqjS0hi^Pl<#$-QQ*2dbGMpgg2Zc0b27kA|Bl9q!LDMrW2=s7F zJ`&zSA9TbJ@|;(lGT{z^p!cm2yHF-f%1_Napz{V7?t)YNFJ=# z4M(+J)Anet4{tJEl-wsv>_B!fxIG36i-O=&pdJ`z&T~2vzqFbDRQ2h;fAC|TTk+$a zxxsIE*5hY`$JZ{sE^15;BgH-1{?J6{9^{@jm-O>=`eh4^dh_P7-+HTE8|MX3jHgz1|-%#tp^s$!y%)$idiG2 zIyVXiTs48TVm_kcyfn<}GqDEJ7g%;tn;DxBS2Y#R>CSkzF*%S9T7RO%X=1p9Yj^+sPuOiy!CV`0$SBg^AEor1 zj^pxy2tFSza;NMfEvJIDeh0D%hR>nI=U-EwGSQ-ezAA?VR>Q2_NA-*+Oz;?4$ho7Asj}{}>|b zge{Q2d^=O6KbeQ!W`U?MN7isiCQG9>)VvE|51K7<=WK8(3e8EAb7_8BAE1DdAEE{) zg#48w)y07-k#mrY#!vX5&XV|?x2`iCR)!mS5x%&mfPMn<-0UDb5Sl>LHLJ+yA40VE zdp?hBR`yQ?i$_VWm1-9HkgQ5wV#m3=Sww+YNg;`ft=DzqpaGPqI-@9}v1d4>p*?n-m()}nle%|1_^uzCN) zv^>Xk_56X$OpFC&vw2Leuq0bYJdD$gz-O(+b*n(4uaSyAZzPs?pw3_%@uH%M^Ke?; z#j5Ku%19@#bs$3<%mt)Mq}-Yg6Dq>7S9l9gfx z2~oRjK%lqjMVhRJNkO8lM%CZU(LPl`Ms3IMj{aKP=uF`ki3e}mAS`zQYO`1RJJFUH zz~QE4GSW$UJJtcjt$r*e`_5aB=~(O=QryHm=QFwFCQQKwmG(wOh#J=z;_PtEP_GI;tXlYM^ zoE~O)o*~C;9WaX-%@wb1)$1HIo1sSIZ08c^Vwc6m&O6uQwp{0$yw;Rp6OEbNhOK%X z7!94FN71ZzA68xlKbN}zP}S97 zEDAvkl-{go@dWs9-@ejOXXhhs~Zn`D%@=zxp|-EwJW@ zo}8O=f-5VYx_Ivp3+zV4VnzDWKH}g=Qe5r^+?gZ0ps~Sx*)Qy=p*98$K^_-P*ndFP zSdqP$F>v`jU6s#vV~gL}@Nv`QyJ`k@hRaE&U3W-&O77jA>$w90=tS{{;8q*Rl*4N8 zC*ieE5^JFTPXjF$@P0E$>Ka-aK60@tHE>v@mpKxGi?5@0cR(vsRe#h9h%s8y*MO>Y z=jY?`a#Pq=z|s0*MRF{F8frX!ws2B5>E8cV=T_)ef4EOP*v3dNLmNZ zX>#D!fJ?D#Pubse{fsmKL4d4m>R7}@9;QU^&KM#~?Pc%%&PJrqP>$k57>6&g)IGQM zfN&xue(xCyhW)6ksTzdZfku>9+Fk_!ITb+66P8BMt)gszoiW9I>EaGgE&!?wnY6($ z5LYU7*hGDJ-#`K9fun}O^2LQ+6 z*gE|tX@`Esi980I+x| z((Qm1pRMsTaafNW=G0~rU~sFMgVD(1{tPLYQ;!4(%T#8v%WUJ2j8d>$yktbSM;v<` 
z58`nNGOUAz@e<KvsjSf{h;rai}=r4xj^qlU(aA`O|-rVuL!;1GfZI2IF7 zO9lZl5FyI)7#z}O%k7Cuhy(ly8IzRn)hFzx3%7OCJe(80M98@vC?kgR)!FX=c5bhb zNnoQ(;p3ALV8g*kEDYc+2rhJ;c2>4#^Rw)MJ8Rn z+XN&8>KqI+BSrXNx^35X7;?(9_ndmv`K;H#Cj1+jLh=)<>X&V)&PS?h9{ zA(56G)1G`&8nl7^BLG`6(Y1X+f`1@S+DY%*w@NVjMLutlS22+Kf68lEp{FH-{8(Ft zVAG@G(mb+NfcuplvAo-r3=+}cjoiSN#ICen)RGC$A^?D>;PN3z5`-~ZX3>635v9@P)5YuTNT zk~rDZbxsKlvd-=NuFu<28 zpowy#oJMBL7znX@@x}tN>BdUMW)Q8EqewsS`RP(zK&J655Py~cu4Q#pA0+|WH5;q3 zQ|H2t9bDWqwOTU;*XF~Vr-bSOf}Q**80G2qx7G$;C8=)*}u zfQH}ky#xyG5+L6tab$#kRXFY0!pF0TcPqV-dlLPWMRa=7`boLmJm1S)E@4aOPG0M&KwuxtMY9Yxx* zi_D1V8*f4@E$FOgRC?HBV>y84?NIAYYX#N(uuB{-`xMMu38XkU`1D%r)=yM7J?{i# z)kaLQSh@ou2smDx02$K?7VQCcP`Be9qjS%ghXf{PK8dIW?mvf=qzj9lhkG&av?8eGyoFJ}OC1W($cRXC!?X~hf7IDszq z7PZkjC+z!Jmm`AcI=E{tk|qw{2`crFb#goHd>VE(YFX?csgfMOyZXu`Ky5;{5E9&M zN+|#YXm#oNhL%;Kw7$VtAKzobPYXp3k+$|B_S;+<&a7`#PD6?-p2bp``yR! zeZRgx989fsuY28bUFUV4=c3GVU*Cu;tqq?k8}v%ck44tq#bqS#9lqoA*<6i@W)xp{ zBJFK?rHU&d&LZWJ0!P9ZYu9qMckS`0Z4j<9SORR+JC%m27E$Nx0n{bqPG{mmvrVQ- zLhoK0$?==!^z-uhH#5D(BC5^mUQ0ap%p7o9i!d31~0P5Hh=~>nmc}E025JHyc&Aoj!=FP@T0k^W5E6 zh7erq(^RRFZr$*F{pX=}4r#zo6v$WQc(kD5r6Y>1pKI2Kc&$CV4j|=puYiKCv&0p` zlv&^gE$8x1`ITayZ|S>t8-#Pa3ZI)@g{7v|b)w^gQLUa+(!e@TXSLVYCX46R1qa=% z`aUEsRgYAizuQ&s`TAX2>RxHzJcl-CI3w!~^w5&GKhP>K^v@iz8%b(e2^|A*aJJXZ zW{1NeSB_J*vcnr?nCY7`YOAa*jg29- z2`o{OSbknKUS7ABJAsdWe5U&n-CV&)RE+tM*TcfCDJZyJ zMixj5H9KhGKM3a~&!3;f=P~act`v=k6aX!8Lci z-`5o%1M;Rq-MNCa$80fa3~a8`Wq^ioOZ};+yL7pmE>)+%P$7hBV!mK+fWm&-1U%aN zZZcgO-ED&hx$Hxcko|KG1yc?yrUb_JudIN|(5!6=AbqK=t&k0Yt<=xU>&4zlO0Doc z^Px4#^#K#co~}Cd&HWdZ6;a2IajIFYiRRMa3pWd(2&x}*|K&S!o;<>zeLaP1%Tnz` z@~ph#6ng0?>`qr2h%9%~o-TpS%POhICH5WS67ba78grzUC@=jSS-!9>jXm+UP%eJ0 zn{>oGsC1scA%jn0+ctNkn%?{bZ%50Mq>%|=h8i3FBvi-C=v zhLBhCN!u6AnL9z2ec4(T!)EbG5amd)_jlL0Z)oOd=SeoUdKSm*qhX%633pc%D+qdb z+JAI^S#YHGt@SkjU~Z}EhdteT`>T3%J>r~}2eD3bsM2o=uQV_Ccb{mpJcMat@!-=f zz0X^;efzjs_q%4)sMhlxsyowD4RQqk&i_C7a|Co|Wsj0hDHfN5)qh)TS zHWxCGQm`o_2p3npZRK6aWH+8Tf0%ObUTk@l^ym47f@UNEv0@Z{mWD-Ia?T97q2DcK z-Su^|y*a3H|ISNE$b^F$C#$Y+geN#ZgAlM?R4kY%I21ZC>E?ZMl_}Q30Q#Ic_4KjT zU6kZ_ILXU&D?1Ld6pBe*?kRomT79i{g_USjvp6rSQJgkUdyt5ics z?LEETy^jao*bJ)W!})}dt7RNr`^MRU))gvz56fmehow;A^MhvvEH1}@Hu_BZUSmrr zV`vb??VE$0Ui4Gv)@2b9ay#kAX`6I>HscCTHlFf9WRa^#ESyhgQAkf%hiz2{%Wi*b zgLHH1MEY2;hCt1#`jB1m>EU{m@BX(H%dk;uJv$*vt$ZCT8_IuZd*k)yf_i4$QKq9` z9XUL7hj3nA`3K#d zvE7}dApqa(v9}UKrrtzUBnVVa(v|C-t?UmMXLsW6VH@-87V#XADnFTH^i2!<_G!03 z-^I~0LpWqqb-$r|o*T&47WxuhcV1OH7zrfZ3FF7E-Vjj0lJ-nE#PubrGO1~vxf(=q zvO{(jYvcM!l4kwn(Lz>(09@3=?tH0R{l!MdHqfg;iL7m^MEsyx?eVod+GSgpwbCIW zaDEQ^v@VQ*Yk%!UnNhvm!kfI+x(`=!lR%M^g=~QoBHW4) z)No{=^lil1P@$N6g{{ygK#5VNYN&>tU7`E|rIL=Jt{dd2@iJ^vAqI>L2tre&R|y2# z8}%k_3XGiZcK18?jz2^CtNv=xGCii5;ljiLuE_HHk~-HA?9A8Hf6>wr=v;EgD0*;L z2)5r}8#G~THtxc>r#2%49eLuBe&C{CKjT47mot%qjFiFAFA)n&PNEeG9~4k8FS9>L zt@6CTuQYxRekxqH2x9<@?G;nl>C~65aE$AOAdoDwS-$fTK>- zS{N!TS)jM5Vpc@#K*8_+dW20wKFFHo8_kBZu7xlwY*-~&i@x~I+|i8Qvr%_U`<8zG zH7MzSyp{ZlOlhZY!mTo|acar$5+`<>M>BMSR~_M%yzkH-Y1Cqqv?2hC2DI&EFsOmb z>MY+D+vG^Ek%WNVNY?;PFa*&}mX^ciI>2u-c1#G#8w|Nri^bFH^5S^rDvKTt^Z9y@ z5Ya|tUDrfohz@7`eZQN>a4R9++o@r<Dh?BVQ zC2!PIXsVeyt^QR*IC1sBMT`yHcfKC`jz?KA>GTH>30D8hbEDc(oee}rV{`e>EQMtreqlC&jY03EPN zz30Y-DJ^p(2}vU%eNOm#Wqr)sn^S&L>|)3x5v%tY!x}ekMpNlwHa@qsK6zpS1`Z@? 
zf1JM?>ii_l0U}L?qngj{!|n9lHWm{irFp+|6nKn$w#HfnUR!pMtd++q)-A6X<`NlA z%X9n0n&j%YFCA8E-WwuksSlO(i7WNwN55Yl*2N5k&q0)pD(DAwtcOaAbRrxL z8zd`XWs#bRzi5{tPQ)?h6pFq8ASGsiX&zr$1}~2>F2-|)UZy^qk$igMChczq({;F) z-->;2Pm3zFXPNg%K9L6}iaq8YnS>~s6(EUN=C#VlkdaOkYv7cHVpry2Gz}n;g%3$H z2t%1{Xk)8$AqYuQA+;(p%6S~riFKgFky?|TwgT{p>WTe#Uiy(|19Hw@L#^_ce+Yt` zykeIx{lZLup3mlqaxPqiI{F5Y)4;9x+#%wb51z;(xNzmwqe)i8PCH4UL90rAuhG^k z3e3PCZD-Ufd>kn`49o_@9eK+<@b>On$QkyEGqTLPYzEPNSsIOXv1WfQaCB$tcWjLV zig|>gG}vOSs+19^<-J0UX#;KMOHV*9;=P`kC2QqqcZd^Y(Iez=t22PTg^s;ooedJF z34Q*x@)5;nbHb#(MrU4{vmZD*_h$wjU&S4=0QN6n7lNT?5`PdDWmA^so);Ty=!Ro= z0uiX2${5zkY7K_|gZD+mteA{4dWL5G0>P_9BwE0nWYN2|(y`*jJ?O@Jm+lFe`x-+9 zb}!FcEZUx*ET2z|)P7|LpcyxzHz&H8sp(s8zyN4?^g7+_*f`-7?_mkn&~Ts9tBexP zxjmbvFdjRKs2KP)6(i3g6*C=XI%WY0xWDgu0hsX&tyk#C2S(dXG-Ms))y7;2q|b6) znNCkgfhpX9bz7U|#?j4#`;MlbZH?2AS1wGtS^9B8GPbp@+d*|FZCRN7yAml)_BVxc#IFO0Sr@XV{z0${Lci)C-9^rGHi80;qh{_wqc5* zc-K?{2l3@=<31qdA@fBh`80j@cPF2}huH(X4f3OOQO9}Ia!@W0-93 zU4qz5p;dprCAi~=u|&0!0JT?3?lE=y6s`Ffq*yuEQ?E{WoZJs4U&Z7V-tx|P4k?%3 zv9zfB#Vgl#l)t}xOC-gF89Fo>q4&_F?Y(*9<|(+$1KeEv!fNOnqUU`%P1s2(87MYA z1Hz_|;T+sJLQZrKpk~=NfTF~!3YxVXHN%p$;%`wyf4(_D<~jDx%)vNN!j2KHDPKN^ zN3M4)n`iNj*^BRE{!{h=#(>lHdvRO}^BVbZDtkW0eBJ;=k6dCzGwa8r+ID!x~rK8HZ3hO~Tk%82pDfFB=xEE+F)@+_(d*rh>Kj}=>a&N)mg zvCFKpiHa#vu!bhpL-g1SVH)Orfl-p6Z*w>&FYa`1Zx3J{>BQPgQ06E=8+`yVX1@SS zm!Pep!z5(cI_MiTm$P+H@lopaBfDPPUmGnU6Ya=yC8)K~jq5+60CwcNiGf)9b89Z3 z=OdzQ9aC?1b4LnUWYuwi2QJ=mM z*wJ)y$)T60DDTTbsifi9E)ev#WT>XUuY|=^xrQ~m3x%(#2VKs-Vg{sHohT+ftj>WX zqs4H*1kTAS4Uf}dmgg+LmvhRzcg&J&YI6FykAi(`WPF60&xvI^V~6YkV9FJRhp_?= zCiH;ZKTKqGn-_L3TDb(;`@7NJn-#O7wI1bR?9dC56tO+T&7JbdjWWxKu=hN1qPG0H9j1~fai>`f+$13m zg?A|Aao^b^uz7B_NIn&t_I0FFEOT21>@Klb#aM+B!&ao(tkFzfaY{f;j5FvuEWu|u zkUv#$6sQ_v++cKB{}veuv-~xLaa8d!IJ5P;`?n7h8FdI_2C`!4xJ2bA3sYx-jyu!+ zap-%;egE?bRuEB}>)kn&@az7E;xj>taE2&0T<=5BgT|`O#6LYU{->nf6GSq#0 z6t3@^Ow0MqZ!b^xq!rMe)T-u-74==&n0~bQpALj(^(6vD5X{=B{9$%P?Pje<JIJjF=(<7%GsR;^x#(2JyRVJy%q zMmj25?+I+6>P?3 zak!R8IMf{AHL|6yR>V(UQ|IcdoRqZdHL}gWdFjW%E~fgj{gE@(W0678wII5Nt;T&a zoCTSTat)XuL_zJQaz^7r`dKU6g|=Fhsux?I`t;MyIKyE4H$@&i%0>F`=BykoBQ$(? 
z9qEYIYB~>vEu2sr{%fYm`Y0|}eECI@%TiHZtc>gyoznJ7y{qkB_SSf%>2LY~ zd$5Wn@gzx3nQ-<;DjK%cikZu^+zLLl738*ayU|=XsdG);+k2CMCJ;2;h57^rukP$* z)tWklEqLq_G&6a=Ziyx>sR#d^(?Wdn#NPY!7$=J$!E0hkwCcmRIh1!y-PpIS_I!cs z)4X4oM7?W5ich|MRYx;c=dnBFbXrfUq|RITt-$>G)DFd*Fu#mKs<6P0$RhLtUjJa7 z+V-WvH4cnnjMKV6xMz@eB6Lc^C*M_7kyL$~yO_FnwhdD*iJN*cYyG>PZ9+fBe5kU| zUnf1Yy>rLv!Hla{!-{a#)o61w*(6EtalBlw_^tjd`yIleK%;^F#8!`76U=8L1joNE zAn05&$MLlBIiLrfJ4N|F*TnDR?8hI37!^tx!Op_Qc#z|+_++S@pSHO^L!@?t|7wGJ zLQZ_=nXCA35$=h_5uVAzuiF%+PWKySJ4CFwzB!e9)>LmmbJ6uc)ndi_$1xEtSC+5; z*GD$*f2v$W*lC?RwX(N(nq9Iwv^r)EKQ1f|wZ0AGhI<}6q04edn}?zK+W*x@!g-yX zs)W@8Hp!n@vTd|3AM*TLs%K(5`YI)euJ?>L5-&HU$ii0`o} zLz}x5P}~3X;ot8ehzen2PIlRUxpw(jXmeTw7ydU0>OZ^FcK{HfH!2E#|JahhP6oRG zBn`aRmw)P=|Dj<;R3I}*K#~0tR{U2l{67yM$Z$J=3LpZxfQZMp`)lCdm4(Wj*8jcD zYFD5`vN46tKh>(xmB=Fxren-|p=$rxbPjyBw87!kzPnidf7~Io5{yR_QuaRMujBQd zDzuxpT4q0`um*u{HbfsPP@^=o>^E@93vOdL6$Qnv_*gNAjBxupubtlUm;QnGBbViu zdsJndW;>uhuLael@3mmSE!4sShpSynAl!)1h3F+)uRMPUC|7;Jq0A#f|6wZ*biduk z7FoFcVkpx7pW_2nh$*=wNS)~)gu-rLhVqHN7uI_28uKg5v>uR6%mTSjNpTBg2HIL=J9ra{sA69L+=5eR5^6wf|3;Q z(k+Iuj<}Q;qt1FWZUvRof6Ra6n3UGSqDtw~Xu zIcM>i)vu~fDs}I%=bZvuv-_9VKNhun0|u5QwLWmByX+v7c&~#YQ*MA`d{Gz4;m)td z7B9@UBq66AHJRX&5G^|0?l?d2u3ZnX0TZ!S@JslZ!8n<|m%YQ)&~vRXXp5XF&N_h_ z^I#e6PnVUGkL6i*1-xcfthS3>Y!{3}|G;fyhebMZ)9rB#xQix2_s0>PUyBO?8M~$; zx4%Aqg2*((A9t|9uMTG#9Q+H6`)lTRkscubZCDFYR>QBjUDd2)JVx%sS*uTzU~IBJ z4RUNDzaYvFAo&ci=h=xWVFUG zEZLVg?^K4{hGpZH>UO2m8}Ct$EUosasxkpd(sx)%S2l_8coT%xs)};6e!ZMoIfV0R z`HflIq;4DdnJuo%abi9=9{)0G1s#y7tz<6}Q!; zvqKr^rU>RAl|BM0U)M?G`P?Mtc{p3G+L}& z25r0^ULVk$laM(1Hdl0OQ)a;@fK0rdfIT6piq`B{pm|7dyi`a1+vK4=2Wxu}8IiX1 z_m-JVO7KzW3SN$^bgh^LO|IS15o>_P>}ZS%Z^E-Y2#f%9h#YGUR}t0uzGfxUJX%BxG>yThZ6gwfByy{a+{K9m#w z;tFb$Hu`C3#PrZq4|PWL=)xEuPDo(b)Lj8BJZ!pO^TNG>@p*pP2<<%G$_w_^vM+lt z&6gN34F9&m=)ktI7AYn6GtW9rb@U8+8CLyDnky_|z}y>WTZ=o}s<(bSk7Qn(a`{3X znMs?RkywWf{d!%yKRR-?;n@idNaG#yWpz1B zW9<)N2KczZ=68}UZ-4wo^+y5zY2UKl5m}N5kCu)kn^UdMq7L2UT16aRh1L|A7h7L@ zxEJfmHhIx~Dak}X-s}#OpUs{Z$dgW+J4gAjshDy9)_bm1a)o{CBMPS959AaeoK z_-yX?EAk%MJt~40GHEy{%EY)T(SanTcMjZJ=S%izQrVl*%QOxK$G4vF{k`k>_ScrX zUUvqmF$0Y;crG*Z?tqCNYLBND0|SC~@rvD_$Li|l*!eix*koy|6tmG^&9`aRp2gr% zU!VAKNZe(c!DJyx6@ZuMxMG_>OhCOC1S>6Mnqr#>+L#jyp-QKd{9Fv0UU)j%W1;=) z?oP4MU}gPS^FlF)Q(*iLoX zM+eJY!g6(M#_S%&pg@nNJYovE@oT)1ZoX+;3pe_7Uer8m!$Zm=#Rgt4htu89SH;uw z>wiwOsFLT6L`Q1jrF+Zw@;JNbkehRakg?qrgdI~=k!e_NI$i0*gNbLkSu-cpo)_Jj z_eqOa#p<=5pR!8gR(|mdFl1Fzq*zrxcjUL+Aej`{AnH)svwt!7IA#|1QN>{$8lHkx ze^v7uo8%NQ%ocTF8RZM_HhFll)vb-HFIjHC2t18I#B2M3dBP(08t(B{NA)6B9z0T2 zI;!#4Xh!l@U*9N+{;J{9pAHWxpm$-}LFkLvj11g-KPtp6)$ypRJ(1`)nWX|A*ya(; zclB}O{xxDRTt^oo!?vr}hpalvB-eU15=(dbmj*Emz&OsF?Jro$eb1+dqISl+Q0fyp zy$Pv~ffO!rsK?AzSNrk}^k65tio_GK;8|cweN7a__ue#5m`Ytm3ORJvo=o2>E@^GaI(13u~xKayExvVOfiao7HF`dIlbU3HS5J9 z3B>7HNGraFsN$bz7sCr`;`Rvt8gEdhqSg3``v5Q<=QDbVJvZ)j6*)44zYaFVh_1_p3`B?rLsK6%Oxlx1NP3 zjy{SheRa;2$mdSjQNV|6Q{!0Ugb6BY{klSnA-xt3&z8CCtYR`ebTLg!DW1Vl464iw zwK}IFj`$3;^M6>8qm1bHmHg_Wm!xkgFikORnA0>l?WWAZEpk@EIMEF$8cj1Wvs9H7 zT(}l2NMpE=kDfeLaFgZy{fhW&b#_a*?z_`%G1IH$-Yt7JrniNtFL$xgG>HAZ2Hv!ez5{kS?1;($_M+{Pvb?IAuBTK;$OtV9sucPaome|Ic!@AC*{#t zhpnosyl*~8?_c@V3lswo+uS=e{bLa(AtNOZIHvYBsZH`^gsSi;7@sc6X;_dSxj+m= z7rmUv+0LR*%N~RWQ+;_WT@Y^qiRSIhX4e|YgQB?By=J3)VVs95gFzqCFnVph{pi^y zSgjv?xurTJ>z@gMwQHBH>TQx^?kq8)uBc99rEZ~?csZ9Ctso6kB@I&g>FxEIRa}o; z2$ef#Ne1EKJCl)DRbqcZF|<>sy*rv_;TKT!W^y!MM_Tx1)5}US&9sO(%0h)9tYQ(W*VDa)6K%{Qup+xo@ccbeuBSNnXF#ol z`LF3h!B7Fj(1;*|+YQ&AOF2=I5VKP!(9~r^2y+us-XX>#FVj+39*UdTZNoDLoE!As z?QM)x?#MjJa(-~8EAPB!XQI@o9Zwq}Y#q2i5kU^8dy36kQHuczxW|wbsnOsY`jz!ltjfiYWFoM?soIATh&)zP{Q#)9yXG$u6JKyGciM;& 
zR5$gY*m+S~)luslo&?A|2J$0brafPSc-3>X`8F=b2yPAYF049=s$_#=RY9h@vub{o z<%IJxd1qkK#aJ9{``TbPpL6qUb7qiTF(?Y$JJR*8a4~h=Z%8f@ZHnd;dLM{NmFkcW ztvPAFH9X<3?}eWtCg7#v+Cux@)ov~3Or8{?e!e}>14_a^SetAMTC5EBYxa&2)rvrmz5}#&)UuA1=*(PbzPjB_h>mFc6ByyaaIYJ zUoF|s*ShR(t?DHsA;nEKx)wV@_qfK>@VBu^d!Jm%^wBTI6A9TisZg+At=++$=lP*R z*8Md8!Zw|bxuc)n)Fm;7T*`MDD7>9KGXT!Om&)-ysPc~si;dg#cz&0~5wT3GP(ht} z1pLgxYX@=dU{RBrrsM2hshHM8swcOTfiSXI_N}r1=_z~@G9L6Ti%w78 z0xj*22f1}lNKMhf&Qi7K57XDw`6u>2GEg1Aa_6Slownnuh9y(7`<11*XA!C@jWoy4 z)jN1(jP0VW1P#32nk`ixr~j3j(S9pH*bPm*4*Utog&y~XO3S&0ml!gZv^buav@Q1C zy(&8#uGzL@wcNPirp4*TI-?9T+y{A5PJ-v;NcW-Wn8>p?=`PkjI=IO1k7+IY)^o14 zyC&-3L1~X~UwXk$&?YQsO#>?%W)+D>K+q;vaHV{oJOr5+7Dj+tV}tf@_TS0UD^GI@ zG_?&8vL)qt1W6gkfg6Irv=52V*HKm`J{Dz0H?1`bjU+wgtYdq9q+Qw>@ivXmB(^!6DmTO-tiMMfRpU6wESFHtct(9c0Yf7_vgv%l>Rvj50H} zQmx#lsT3+=f|!6qe*w*Z(O*%19>AS=j8+TET> z$36@Y4{=<`Xsoo$zbR9#LnHYGamBzpo`HAtao&xhOSo}(254Pv&svn@YpnnzHnZoY zBKz7pq+S}`ZzH$mqYR)f{v)@e&wz}% zsGwh;1J(ym_X>Nc&IYazKaa$wi3S#ff{Q>V2$WB!H##Bj@;B4}JShBJ(+|$2z{Q>q z74{zk!k34>Yr^j0XsVVV$<3eH6nyF{M0|qSzDM=!zfOV5tcSTD!R#zp5SRn;Og0c; z3>)ofNgq*c1HDthYdOZUKR<8s@lQt}Vs-Jm;YwWLzMm&Ww~#|>moDx+j{#5RvTAl` zpgfDfx(#}Ug6f`e1eyKe&~YQs%=Ci9Fsb#n#n$@gcXGfD<+qml9d*GF0p>o#y-mMN zVGyzPZ=N2}f-%?geg>NYWN4ydq>{OTKrzW)2pDG>fB-X2xx47^l~z#v5&F*ltT_Pt zp^pfyV!t&@{_HtKXg!F8lZVP}3qF9QE4uyp`vth0v+!Gu1PPbmQW=R;w2u+62=~EG zo=oI9=-7i}XgEN0;aEtFR>TwifAlwWf_yRD2DT`E*Iy4A`BjAd`#;|sfT+BAC4c(2 z8~p1&{Pka`DG{pXH~9HzrGNTv5j=11>7(bDe>?wwo{Y5_E;&peI|;(Y|MhFehoRO% zb>``RHsa^-1A;{u!0T+4j?E{2cHQq|_OI5x0RvWc`m1#R2Gg?vypFWTfwts-Z4T{| zD}Vr;4Px{A?^FD2B9b0v$z(UX?LT$Q-|x{9=iiOv$e)I=|Jge*fWyv0o14=WKm6~_ zL9o&jYaQppDE{Tz*=|Cc!xW>ce@lRV>%Z?;f*?Lc%=k9j-ZK1P=~@gy4|iG!R^Z6N0 z?|Z**=Ev03)YSaARCRTy`<%VcS$plZ*7L0ABuG|T3=NqO83qOhO+s8)9tH*;0s{m0 z6!8i8=G{;Fei#^3elsB3%57#Q)OXcYuig&w>VjmYruVZm4|ckC=| zfp>`Xz9(eUh!VcC`l#@@9%_oVyZW%t>u55jVaw7!CySKn>CnA;wXOfNq(K5!>pXgY z;CvwY{?1{I-()bk_WUW#7Im$@-OB=4fu=lJEMmO&XUPejBug<|)l**6ybkQCIuWhqYO%2f9$YX z(mTa*IFPMq+)~1|!E%tvrrMrbw3i9@m$sa!BWk-kh7ZQkpEHciZxE3kmU1CJX z%+AK!=RvI6c+HgUhj4PL;ey1pz2E#hX?G=$QJ$=aSPkOYXGC8J=sD}xknyXcgwci( z4?eCZ!zhb<;spOVGxN!MD?WVStF^CAD8dp*?~>7fwWhgts3wsOhA7Rhu6<}=BCvT{ z=J8hUgFRh}CL(^8An_0mekT>pul5j!8qBg_*8?LG5$KY2!?yldywY~f z-9UP*Ndl`T0P`ZcfB7?xTNt7?wgB@5HW}=zXN33^`?$Vj-9AM6xGG}dulMOA!@3{K zkPae5kf1M0{>c;;T8#XO-Hmv^l8i!&-kFF{SaiR1{Bg(S)8qJ8l;lJw-Lk}$6rV6Y z2CcNqW)N9_zFuu|#c6obx)sb~xBt`4vznTRvJ(3xrZol^QZ+T=`aADndz=mbtMpz? 
z!<#sic|vpd<+nA&ih;c~4AzJ*>|8bi0}up;UWOaduErPz97dfj86UKb$<9V3_qm4{})ItQ* zA|#6z^|Dn6Pi0+7x5`vnwW}LXt*qDuy3a=$<(MA0b;9+R>x{uof3*2<;WeuFnG78*V#LqCyB$!zr z4wioN&H=Oali3zI>2YieuI;1wfWlmq1{kkp zqKhvih)C(Aj_okaB1ISo?YMemcjAax0qx}Z;#`pdpD|uTh~fjkkq zD@}?Y?)>}B5ys|QZSquE)a;DeJZAIIuLAL}P(p>VN9;`TxRB^Y*E1!Dt4wk0o)-(r zq{j|FH>JLU+wFqM3EtE+AAhqgnCq(GTglV2f+rx(% z+K{tHGDg*Rr@l_Wc^*tm84)QIFQg`!pPiIGpV=U%q{OCjsvN3-TQI1sRahr;+OQ=w z5vE9Az#A<^BpEFqE9)XFBX6UyG;GRhp-w^{sw}P{SeTQX!<*(Mn_0-2Z#RBnUNKHH zzBOJ_(5OgVsF7bW-c^WS7+XlJ*j%(&_;&L9q^^Z6o3Xi~>5Gw#;f<**i#pRz(=`i* z$@y%zw-p*8Sw_XU868szrBzDXm6HZsqSa{?n><%MS8uKe++*L!&50&vCTD8pG>X+r z)ys%Uc4<+onFLD*4B912YrUURkIhWXtyetv-IH37aqo1e+(g(y zIgva)-&z{JH*8@-WbS5DwU)K9UBY0mWdFjFWLYzE@I`N*X;-hQ&9&`aAkj1DE=0C+ z_F(n|6Wm2^-kgm;sX*T27bg<#e2P}W|o1ZS~N}FG(GP4U>0Fk)|)YtF*G`E|Lu=sA7LN2 zvLS7b(vJzkLI>sjVyhyBB7D^;4VSr%Ia|wMD|5@enNiC}mee!KGuqYUv&_5pyKkno z%EWegcPe&icNn?To~Aq%3`Bdn+lJKE7#Q+N1#RCpi>v%a$P0@X#9W(PBAf$G1upKK zt@gsRpZ|~#hF+}gtR$^B3|y9k#7O%0X~v4iPL-UM@Ju03ZCkTe%T}vdqgj{E6U{%~ zd%j@fHq0yPMt5L(cCoej=k;OH>CR#K$-DD{lPz6y7K8YZ`1MAl#kY$U4PkCRCx<7m zPdJy4V@oq_O7pGqLHMKc(AalO7vqYea*MgRZ?22QG*^NH` z^5Tn8SFhf#aU-hz(@1!tr|v^%dPBOYsGT3$K1hC?6RZ@B6r2p?L1aT8KrTkwML9-H z#B+uuVZ0=0;%WG3(7$3=!0Cmljmbi$PL>~m6tPWCEHWb|kwGYyo=GNJ{KAQGmPQ>% zi};Z~2Q#g~Ue|29bTBWW*q4yd!p|Or`&j(Ecbag#dV;Mb{w_gpgCw!{sb;7oq7tGe zqPnP5sGDVwq}3uuIo;bg2r?P733HGYo{x%~^_b0#U3K|?QRB_ADC(MyEMxDQ`H~3P zYjtfqbXyIRkxgw$wq;5VPl;@zy+vXFq|vcEv}~wod~!J5z}A3&BXCFK9o8cAUGRWz zjL|{F*d<11vbQK@$9LywN1IfmM*D(>n#V+8@OnOOo~ZVb+iN$EW2K|9n4#W8OqzEsXHKEdB?BfHNC&Oncfq9#tt z;1prab}G0X7g0#X4aB>DgO=KvILWhMd6l$@Zt~sOGCrukKW>JnPk-`8O{=2Kct1%f zK8kz&ymrt*rQeI2(MWsMbM>-fo&SzB4UN~$ zC;IUYYHl;J9St4$K2DcTyQPmLA+eS8r$fHRT$99rnu6ZYwh@ zN0+$QySHwk{Mq}HI4L_RKIK>vsy_F9(0-xZNoUfev{?>Qv8;MD&tC4&HQEiD3i}i5 zq6Nc-e9N)if!2ZQAO2SPh2x6T(I?+IV zD=1kssUsj}>$fO!1TX5~W+)Y1DMG<3wFiz?mH7HJP zjksU9ge{?0yZxBZI;)$v?YMqtaNT`|t@5ZOr4G+?cgJb)RFXZzrgG_hom-)SrPw$>C1y&VsKkL+`{y;y!GciGdx1!50wUhXh{RPX7lS1Th6T@77~DL6cr zqipxDcg-$Y#!My_N*21@1wBf5h>sh#11>t&7bE%E?u$Ilu8)Rw{Cuj;Lie`aDc#r( z+z+3BWanP2Yqr`8+x0nMxL;U5&n8CY+qvn!aotns%{57-_969ByPnvySgr21Ie&75 z@T6T6N#%~#_h+!NG2DSW0!*bM%*KJIC_yI2eGf_oM@28@g^a45=r?>B81t2T4^76K z`b`+!r8llOAFj3xU_WQWhHs$BvR9{bxhhAc(i#OI4#6fGG#mdm@%2%_F&QNObyxcs zqU3VmaLssE?fGe4@E06F6v0NS62?+eFf`yZA`Cn%A!PLf4ge}e;e1T*uuI2y(gGUK{!Ai(e9Dwns%oJ4}Ri)l@8Q54c=o#AR z8!@89166m;k`Zi9Eykun1iT?ik>pG2G z&HkCm+TmZf1#XZL`U@j70~6!lV}nz9px<)InzOW^PGqJP&=c)hk)BiqI$-&58$i@oH>B#rbdj0F<|NQY^C-N{tZ~Z?^@t2(+ zz6FNnL*`-pd(rrirFh7z!8*P$6P8f~?@-SKf3Q$5diB>k^s{MG;6BO>42%Gbgz#HM zSJ<6ogqr#G#_BFsh4&k}@0oLi1Owq2FdiY{qoxPgOTfjxqu>)@u#&H7C4(RcNuob?{c_4_AdCGwW^ ze~kxz7U*6k_`jNo6&?ffZcGse=bzdCz7T)Zl>d2M|DEPxrPBUuwEueA|M&Ywzyj;y zsE={rWOK|Q;Cdv?+i)$(VK9yue}EDL-tTfFGwjRP!2w7go$=4d_>QqkVNq04kn?$m zFamz>hRfekGW}#Bbox(FpXvTsbhoHDXhl&8?UVXDjtq;#ArgdUNzOvU&}>}GbDIXe zr#uG&$6#RP@uP0?FxK2uZ}$NHmjk4>CnI8sg(Y<~IFLwA@9W)a(GQ|A*IR|9{mVZ; z4m`%!499JM;eD~9cdaZ*%>rTnJHQFBBas+7Vv~$OL)5qEmN;gE{=+U}o6gYZ_;$>nU^ZQ+R5J!$$dUR?v;7zw70* ziFS*3jN%McFL}?=RnObUyD4Eq&X#<99C|2Y4LhY4Ie_&B(gR;6oU9T49Uoc0!3c^9 z9Y%71(Cjx!`MnjHHjaJClZ9qEgt5s~S;MMoqi!0jy~lv4XRa){trJ!3l=pQaM^PD?21a(^ zY0%+GD~Pf>QEc8_H{Z```rMu}?+@`^_XCfmUh8`NcP=x9&8I`P03kX~CI@eM)$-dvK z-%p^7)f}a3zByVUB$&N>QyOk}`6o6o2G<*w?_&Am0m0#|Bg32CHW#cn&ND{%OCb*b z>?#g8=jBfyp*h~~DT%Jn_>;3PV_p0+)E$lN;iXm-#uq!lHGekW-|j2Q2fnZur)yMF z(PEo6FD%yg_D%sV<*Hk4B_-8&?s<2){-~Z?)HuE?jF{N-c!1qcD-9w`aLeOZQa5WQ z{)pE8qNj5(S0?qPDoSKo|k2NIQy2s`EuC*9QEk&4XOqic%_W(`>Ik1io~C zMVqhaL#qxyXIUXNQK>{tE3M>T_Ca1GLF|Q{Fx@=3Z*9!Z^JlE4vUCkvv>nX@od2xg 
[GIT binary patch data omitted: base85-encoded PNG contents]
literal 0
HcmV?d00001

diff --git a/docs/source/assets/kernel/value.png b/docs/source/assets/kernel/value.png
new file mode 100644
index 0000000000000000000000000000000000000000..f585c77b2e1449825a3c704cce6b102f567696a8
GIT binary patch
literal 121414

[GIT binary patch data omitted: base85-encoded PNG contents]
zpfQkrjmMy;yBHs{OR#U>?ykgq_lsYp=oV5)lMS^!cgdgn5|`aRML=`3J0 zlntoLPw84y-Yh@Ga;m?$Mtk`nPtheJBD`LXqdx3U^{Ib(8W0tJ(JXN32I|J+I985g zR+5YAqZ1?=NgHWnwp|W@jT;nqp6W~$-;zvIJ-0q7(dF5>-kwsu?%vgQl{4vZ-c7tT z!1{GSv+x$*i*Y>X)>AoxpAX6sq0o+an;Y-F<_kJjX8Sro9k%zy0Up+U`-xEd@CZ1U zYQOk)Er8O41&DQ@hj!sbnOidNC<6B3v+fIgLyT?|?BwAHgBPC&UP=IxPys+L7{ zB)5-=OMD2J(UTyGlz{lI0I=vT1c^?$_%}j%nFQ=w)4CdUg_K&78%}KAc)u{>!B9p^ za{T%s+zsF38fU&7I$E9C9gbSU-Q^iJ{O;?;ra*6V2BM0Enb|MUUo#hOKYKz_%lWPW z$ADq;C2QUrpFy=7lkzsB{~+!Li>Xj*9g3koKfh>;66-~oGEMzE%$QW(`{+4h3TGk@ z3Y6MvK=3KtjZ-J?yKSAG6gkee_uy>V0?>&p7Wmq#hjANo0@cnoF0)Stdw@9`_bt?{@UEzh0Jn1oD3q$tR8dIa z?i_`lK|UPTDfH<(JXM2(GT0-mQxr?A0qD8?dpl$hk>-?qLT*rwxh-}`Mwm?gwM>$Pu1js2*bY5xayKU{Z2%Wa`fW{U8 z9{xLnwiP4A41}>qWUwVN@XGh!Alnaq7{k1D=3WB_=MpHPBp?XFsaOL$1=4!KunbqhYuH)q4%6k=-VxGFoECiYnTO4*5pl# z5(TOe-OX)e&9&Hd13Hwqz~z;=u{@mwIx`DFfs+^2R&Amv_|3XbUndKE1!lmjtVubu zOX&d}$_^Mb*wq0BGYJ8Hn)A((X80r0wkjqjCjGL%q-b?ft{(4UVMf%3+a%?ZJIaUb z7`9~)79+rPlj7+ZxyB$HOSh5>BzT_BA!2%yGHa|9kjP;J=KTZB#D)1?+VoA)>C0>JB5(u!RRyHmHmdNH=bd#!%)iJmCyL zi``7Bi=x>D%$zyE?aUYJcXz73?tWm|N#B6My@em3Po-PkVfic(QuViGJqq^=fh?I> zc+9NB;Wdp;d?T4SkMk;^yw(!|PtE4+ckTju1Ww_QO7b@!A_wx7^l5wVl^(eyNa4)g zRsbX1iy-n6mZK^0K63?-_1S$a;3bzSE0Z9Z4=1U7rXn}agaY?)fZmZxTV~nST4<3+ zK~lY9%+GrnLE22Y`>(r~#K(hb!COWE86|wXQm&lHuPwU~eY2=`Ez~PrYN_GC6kvC+ zd@HR4uyDK7)`F&M@&O8>KsYI@)!z{9F=UP9aW9*r=F@8{5S3AN_ z+-=aD)_J)@SVv_)I<7h~^N}$|`<(~{(SDcs1=8mpUvCIVNHDr~#{)B(TO*@$RsHqN zt*s6a{&wa$WjSy)iq(_i`|z}sPO*ws)%a}^iJEKRjRa*I$V@^DrN!wy1*YDi&;|IQ zEX(9;;@XwF-W8fL`C`Fg_Ysqqj&QxtsjfEudc$(l32nTqkNNb}OK+$9s__`Xuy#^; zlUEe52dBS<%wM84(&}s^#J|9P&D;Irn4b8qu2niEIX(s+`O^!%u?O6G@*XXEeytZ@ zz&pb!uAgnV!|%V*)(h{cE>xn`p0@Dx^z{RH>6ED8meJ#gu$qf+;p#JzxRa2$K@@RGFhW{Ix5OJOSN|;j)w7S}5tL-yW)ee$E%v z4c`DJ)9Q-kYJcdRHS$)g!uQ`_g5qw)lOLjpo~dcO zq_%o7K;(MMaH6sDo?-a+oSbRhGbx0hFY z-PYnQW4QBbp2;q+Iz;*XqTdP{bep;o5M@3}?ji1xnkk{8Q(L)ZY?NYp^)QzkLGZSD zTvXXYJguBVrcF5+r}U)KHjWj_^JC{$a{+WcV7_1-|7s60>i@N)z)-jOWotntM_!M`&Sf2N7WU&5;; z=jJUf+S-*qXD&wTTc46afeJS_hXHbOsP+&?YrQL-nhDztVh{o@?^^;{*+qpZtVi%fv3fy#?1T0 z;{~5v+H0LTzSns5_LrEkG$>x3DQL~ng?3@a4K2%Rl-8UJ;pqubVBy|=ZSvz%47pCM zj}hQCIJXJ=mCEHD?vcMR{ulkUL?rnmKLfwz1uyNldK^ueS&RY zpdwx(MhU2%RdOZ79lhDeHqu1s3wQ@h-;R7*4nm7GGVLSkDfHIOTF-MeVvzjk-=IA1!LlXj|w9axM-FvI*(M zem<+&n;h9P(Xo(U_)&wKlVJQSGnws|{?Hd@sz`L9(2}*K`-r*3`o$Z~6!`G8MufZn z#w*%7HSSL{-${js)g0~RH={kc*#tc$DvRO*sf zKbgl#IFv_$xv{geUT=~g+T*zTJv`34{cKZzN0$PTGROWCxm9Bi??1R#%ocSIHM)*k zJs-XsBwV}=doCFkG@a7BKeD&EOx3>heL9=At$*X$QR4?&*s;T3Dfw^4AmH$;*E8$I zhGdS{L7tk5>M^NY|;~1PjYKP`t+-^2zekeJH-3zGpAlC z;CYT6fQja2C?NQYdTZqy2Zd~}6nGCZxeu$CTusEVwhZ(vNzq}iQPOLp8s1pDF z%gci-?RM1HuE6WOEc55r`HtY>X=Yu)6(eV_Y~skpz^Nl?-!INWEXaM5=+L_W?3%41yU9l|bo%LIc_98XO5)ef=t3$5F6fA#`67o;EarKez)6G2h zc0e{TF!lL|a*JB#l?+TH%O1VR%88Sotvsrj$-~*9vklq z*8_~Mx0B&>1?;;$hmXpU=SLoskhN^B7A+)EzN3KY&MHmK;npd9x>-B${Ocp5evFw; zT~TrPnE1*!RsId?Fe9^bRb8_?Ii9Mk8LG2Lo~%&UJEh`YCj~gjSV0@<)P7T~Iz#65 zgtc${r`{u?@e$RS_g5J1-=JN>v^MRB)Rgri6QPXIR)uVR*CuISiQ2;>w=tIU_dM9V zuM#^GQ3rwbTu%%|?M9-U0*Oe^B3oM~s}7%-AFbqFpVkZ7%8_|{=ZO)Mh(LwN>VmP0QbJh2Q6U$YzR$YZ_`rUidGe;w9Oh!v_h8`P^^M@>2qgbanb%}4Z-*YQ&GyVthrb4SNF9Y zUcEp@LBl44o`m}`h@cJ@JtEKd9c`R`vsPvJK$O&o+V=P7Ue4ZgGV1HumuYjHoXaAD zQIhr_pIaHYzJT?UCV>L6T}Xv0S_hz#xEjOniITK^bXMNVRpF!Ux0)attkPT$LxsAk zE*1U-t^hQ>(>%rhqVBE3vRt=raYYmmNkx)M@~?~ zvZz_31efOx9zBi2n%3P{WO|n9-R9hFVE3-kRgM{%3Z=QJudr@0+%{2%yovd#dcRYc zQ*59LokdmQ-49-;<3ikjyxDkfhmUbq@ffA?$GBS7W0)Eot_crXy+;JBWEA|Rq5gHU z;vKxCNS>GIIx8c>l!lYXH)^j&BD;Zs7G`Gicpq5k)@JmIA;ER6iG)BeHbfCuj3<`+ z3d_Z8A2VZ_*ghI<-=B4FU1%6OG8EhhIaryO6QSx}?Xi@g;fK~QfyYAg(K{kS%x>&S 
zXM$dCE}b8o+rvpl230o-(kCf$<{gGxJDqUeHVxQYw}jc}2M(Lxl)CN)D-v`0lg{Z* zI2R@Ky>S4>VoZ=5Hl`#r>QC&T)Oc-=TxC>lKe`sB1ZJR3=bRKZ=}f7Zxq)Bh{USdc z@}Q&bf}`*Q4tif`OeC5 zGBBje@z?UuHIiwfLd6!aCw5PJ$&o*tv+TXZ)Ff-&E<=bNBbISJ@Of+WfQ}qZ{*$lJ zRci8($glBLeW`vva7D&0_VS)#l{Lej$7+B<&RPU7k7BptP7`T9h}m@|i`P3uYd)5^ zoa&crlBC#pEVavEBz&h`{O)V)V#6r_#z4nd8EPN3s_PT-fX-}tn#J1tZ z+8L)`>~uNW>fhepTYg*<*Q!~RpO{XpJYi#sZG1gfrwBD(xrUOq%uPKhJib3!lX&b- z3qd1ory?|POQ{Ol)tGDLSo7Yu*AfX2g!?z0w!}t5-ltY9muHXb(~h;B4k=%`Jkd&* znO>iq_P!z_e`3kW8{LL2#vM31AuNDT{l7lVv~m zK}&R-$mD%VdQ0}UcF_f2a{o-`QttU+T5bJ!$Kko9tbpmAJcL^ za2~klt1$qJR99DfVoG>S$}0F6L^o`C+>@euzzH%+J>O7&31Piwv+!A^2I!|>4%e5u z>_!FZ@FvSeiDMXegaZJ{$N|64^u+mX6ODX3y@iqyTe*F2h4;h&sE||q6ZF$leD>Cz zTTQ~Bb)hz%avIqhN(k^UUkz}QUC_&_X9|;{gG}G(@rwR@!(Q1E7PaeJbqf_1;in?I z-|M0WEH!B?lEuGI#h4O zG$@O}0&T7KbiZwrjzm2Y>09e^5zmZHU@*fkq?Y`M={AP*$cb={ZF}nrL|Nj&H+{1x(px5&f^gqX>?WNX#bC_sGND{ z?0OyZDpz$8zdAFSt@2V)s}?0!8z~&tCIKslBz2|x#F#Xl6vwOLyEf7jCukX5GN=2= z0K>BcaO?9D#D`zza=se6Ekq|n_Ku1%W?SI#Wc zs5C)uI_!UN93rKx`NP@qG^=HY*{|0FG+@jpLV>QdrBxobvq+ViMxqaEpDar5 z`ei(e-W$prpYA27<@Zlws24SNu5t&j+Kz6*%7|F~tTO#=$fJKavE(Rv7x_?hpLzAi?s`Z5qrHuM58?H?+apmfEoOP>j3oE+ zj|Q%c0wdq$41THF>k#-O)kUG;Iyi}3Xd}p=O#lK&UfJhcBi2KkMp1f^Iwtiw{Drqj zaWEw5;S+q)jH|pl<3ONp-L11#{IhB<`5C_d^i(jl-LOkpOl4ytz&u^oU$86nDd8k# z#M{cYshm?1-cUyi&w<)s>$*7;&!`lpM&A@|BD142_)Pecr93<%`4O$Hxlz)NAa?JW)^tm7(^9gIt)w!|uahP~ss_{KtF zOKh*NW`84}fNl~&G^VW&HmzIU4RnzkWWKTdI=IJd5WYy_H*&tydReP^ZENIx(mZ~OfH|Z`9I@h(kMp(O1kt~ zy`h~!n~wv58#u8C%4CC} z677ZKTb!ytzEB3CV@C*?cs)vC?Q;7!&MO zx!oG64_2wA>j-)1Ut%7|?q!6~P9iX&B|LpxDF5}l_0(9nA3wW(Jwcs`yZ2 z+jf%x{gIc_6Mh72lq(jE@)-=oNaklwOItUi&+c9!3d9m-qY}*+r0icjM%Xe zb{C_?kE$^@G*{3y`wKPgucG9U`OSMm?03HAk);8lSmH(4x!gGS)wkApEcz}~m_O;2 zlflhIkZ~#D*y`0F>sO2x2X91wU0MgcLsUokW+Tss@|!$qlZMNyZhN$E?xo2hTJ=D^ zs-fMVjiT(srIjvnYd(CHw9Pa7;Rnca;MmW4!_5n87WX26^gmj;pto`@R=ql+DZ)Z) zhZ{5icPEGPZdvWGQXh2fLayg?)L1sXdRl8*j-$#julkqy11;+lB{+;|5~Y=g8o4Yfg96%#HQxWx3c`oHQBc5OSnud|45SH(`jOQYe8JZ48X-k9ecR8mRsX zJx|G6E4>Qit}1liD%NrA!WXp&1mC&RfVg3tj|e2kw&y(#2*176pn$qi#p_;tjFXq& zI<1cTEPJ{%(24_%lm2eT){DHy!+y+}rzxB#U#jR9pS`1O<8Xu1Y9$3ghW%b}AkxXL zzbU+GJUA+wAaFf&QEOcyw%u?0%zG*@}@`~7G% z2_>*o6SgxT@AkR~T-q?N8uw>?ZP7v5^Ql*+W;uT6+ybD>!b8HF0}p~OAMXfTTQ)QH zcs`dBn??(F1kUA0^H~{EZwMD#UdO&%1Z21C?X945w>Dk#%|78^%iqVy^PLBq@Yf3# z#BY-439%!eN<|>f0YTHsXkC<)5~&ORfsnuqOqs34<{WA;arcpIs0wgk$Ck>&Ac92~hN$qAMMKTGcrC0C(duQY&H z%qq*{`QxbdO>narL5wcew#?S2+el^#CtMR@JG{nBZ$%e=3df9%vsRw2c7zsI&H&j| zk;gg786#`PXbx@9l8?TWwn^qm8_B!n?|+z`{PKQoVBof7s|&6sYg-H#qyGZr=9~xy z!R2bZd7_0dlD92aUj*zCIcB`>fHRe^Ya-OIRxhwrnpmG_Vxi%4o|2HYUYY=X{&+p! 
z(a_rG=|V~@#XHsL(}YaF4TGOyKOd4PdPaz)MbaiZ*f;r!+>fUCvEH2)_8a$BAeFY< zh&cOm8SLulo-b3r$<7Z+M$yiJ>4!=aySZxEK>G6L+7mF^(X4+wY*iAWJ&ZoyGQ0wg z5U92Jr)LEclF2&g57_t5D)Q61qrNTueCtxNW)iUnemYZ`SNuL~sA`I%71(Hq)MchR zV27`6Qi$8)1C9Dr9F${PeHj0c{w%|A0MGjpR(8$7;x$P3tY+i5heLG#n}DF)%9YGH z9CmlgVmKze>;{bKPWe14TW7AXg%k$Lt^_c(>2+Vtx>#_>H@slU3#DhsG>@Jc4&b18 zo*Qy=-2@etdUkg4zCQnX;b)aNWSDg~N+Lb=;gi*s)}%S8W&16&UGi6bDg-W-d6z!!^WEktsLVU$gHpDdjce(66WU z9#;nK5T~K+Q02O0fAXd1s;!aSZ4Kw9b2B-&-M3o|B2Sw5wEzNgh54ZngG6^>+732~ zt=VD!@W#dw$Xpw5!nmL1GTfVtuj_E^%wiL55v1s5JYv!gu^e!;fCmVJ( z1{Ww!N1_x{$<6-i&7v;dIKjwB?t|3n;9gha*kKL32y-fw=)t;Zesi9H!qiqhsX<`7 z6@c>790zM0L3mg?C zN@+jiv%8M(j0FoVn|_j9r!lKjXf}g z+Dj%eZ=k=3PSo*p%{vUY`m)L3>YAE^of>OPj-D63?ExV}Lb*XakF9bSnm0Ll9nC`w ziOLoiV#naC1W~S8=hVi_UF7#ZBDg^7)0Tp&F-QV$S3(K;%ZHEjsTezEVY+YMW@q^& zq;h6A;+|0N7=t|=SHaTpG1qi>gOrXM5H#O;p}<*-o6IFs=o9XFPx-*k6@EyPv#gTW&}FqGo6w zslLQack?Z(Y(8J}YxzDu`eo&!*xBqV7T>qLc9umy8A9JlLhf?!GNCBmNrQ``;1L$- zcDY8BB>39$?2|;^#_W0K&57LZs3Kn!uJrnc^2Aye3F{#9G$#Ty{Z=6}x%u|0up)tm zf!6?3PEd5J;B-n9WwJ?+9G>j2ax|Z)P}L5x@|y|W5=Cqh`wU_-nSKF02EV$aRBJfQ z#(oXeT{30_sdES%C z{-wz*T?cTR+s9N?I6{wD7Pf?=%UT&pA+2nMBNdP2gr{Q1arniHX6vR-zeo%{q#)h( z0{G6@`YG_auDNJZ=SyfhhU^OvsmI-AvgyFlDw@{pz5bFNum_n^o2CtY;h$fhji$`5 zg94Cs*i`jSzYr(;uS{uF*z{}reTMRelS$IlZiQFQ%Ni=ABdw$*E49vtk@^f72kX*r zK4H^s-xVNb+%BV-5uM|ze!m-IlBm6`!0hhG?=q+_$}PK$n_Y_DLF4O%1j~iKuRCbj z-r3oCGx3WSBB}m8{a=W#iv`5y79=K?xC=_16-MDNHIZ7$>5F( zCVn}kHU3U+^;h||9+9O^7ZYN)zO(8Rbh*`9B0#-UU?{2!V~|6Q%(x+mPpod0uPXnc`AXF39KIJy9X!vl$(05{7l z+sz~TS12iD+(kno^@0L(Zj(DMndQlgBiKeS>-`d;G_m)fkPUpdE?!pGm60HN?Y)w} zx#CYb{E(XM>A03D8B;Y|=P85NbNWw~G*&zO#D2+N#*tpG^a&gmxE-#o^WFpeZ(p^@ zOwfPp#0f;qzcgYTzx3Ez%Wjyf+p?Br`rwPd zd_^m}+t?_U+Z$1Db5hODK(br|P>hXQP04J7CHTOuhwU>vJ6#{KdVNo>IS2Vz#5|ZS zalD!+u#1-3cyU0QG)<>q@8Dc;8nv3(UVb_L5xZzC@5j|}{w3AeWR#s_@HeIa{k!}X zPYobGEbP8P`d8|Ce+7+|Xo7@U);hDGMla#s%)ex!yg z`rkZ{BU}{rNdV_6asIf>y0)O_GzcizlKgQ!z&FQvyOekFqvFe;I}Umuqt>r}F#)$h zA>cb)dV=TM!8Iro@y53I1TR|}PJNT>Loy7UywxvWMK0VZEL3#qwTUXf9+zZ<)nF89 z({=TtbflZ#Xx=EvvubJtXrSLkQQ(O_UP`k=iu*}Thj+N_5gjt03vo3ycGWo4OjwDW z9zA+n$?@T~mw6>tS~|`P+&p3}2YJ>(A<_9Ilx;7UPL;sKdc16?+>);>Fg%e6iG$nslGL-|)Jb z`LV%L{aE7mcmluIWMl>lNzXG3ZL97e&TfoKMfL`-)5qmEKRI|KqjlE)OA0x^nzgD` zMe;?9iw457AM1ql0gE=Oqk<}A^~2~=Z-Jm-nEOfDwYwZ>Ud-2_ua96)R<6x;M2?{g-D{Y z{;jKDhWWGgT=n(q$+xtv?|7lAv24Uk^xYq`FA3?W+zX-nUoa(~rqA?ny%ps4el?pC zYhW?Sk||r;i=jNPa)FvXKM^IE?x9G5j#64jmFqjqf1CE1%YJOGYp<3snP!scd|Bs8 zBM#rHMpI)$L-WTF7Xv}@PQ2mBAeVKs4|IO2zbL4v*v(3}4`;_PAfk>)`8KhZeLEpE zbwGDlaK=^ol)^6Q41GZaBb_P-PW!%!l76YrO+^{x)#mWsn_w$Wb1B_^ZbU&HOU-e~YIsB0fkU`^EtRszTI^3@ z$1lz!F!N;A<}CGZJ-}I<=Pa41v)_>FIpu^s-GN_bk0wj6KMk9b)DSbrW`U#tx; zu`;3^>U{8w`VT9==G+L8&37ReS=VB6i-%a6ZJQXPqQN}BulZq4sHtVX`ub3RHE;7e z!cS{xkJ09;8M=!`X-q3vyQsBYk2l3TUU4g;YfEP=qNH+)t*fRV2vxZ@`j0sNCk;Ai zDOdmd5TVG?4?-h!T&VfeK-9N}>gPO;qn?XClh2RJIWd?thYXw@NVw}I9o7up^DR4+Vu+k5oN?YFf^ z%Sz&fN>J(nPo}^G@{hZ1W4j7vBc(qxILa2Sl-k#}fBGyDnOb%zt|Pu^g03SrjTbAt zbV2!6>?*hDHoBm%WpjNYSnoPBu}|ZKp>H<15O>vL-)1^@@l5W!DutHDLfUT{@>WXS z{u4v*0w^VU9aXZ|1Xg!F4pt5>3Mpl5CyZ?;%$6<@S-K;Pu%TkRKr$i5Ol@!lc$wf< zaBsRj!D8_h&MB4rz?VTdbU-Q*|H|z65odsiOlr%sFm>l;0*Zb~U7B(S{1YEqDm}GL z_<>;qc-NJc65G0Vl84nm873FH@o7MXVH&(BD+YmC(e~uadZNsS@`x*|XfQfPP;FY2GXg>nbCQ z=0!{wF+Lt4EOUC7<-#7tX`qQ(lxdmkd?rr*QG8RFia|&2hvRrw%K;!9G{o{n`M!;M zj)V0H<7Y9%iNuugPy*QLc$kahPrBr)UEYW;V)!2-mcxV!I{}@-N^jWmD%YB484Rvr z2zK4|=sEOR`$ulNZw+efHXp|}hr{?fC;bYp;-6bvJCF^qiaFVol z@k*3h7tbhNX5URUhmVSP48bSv(p8O1X#ug9bCY#?PgzKh01v)_eUbzxPf2voUdTW7 z3^unUG|$YI7pBC-Y9eC&r3fxIC6b%fXEFNvoR}L!aJQ_+nlB%R>u0-1G$>%|ybzc$ 
zTsD3?V{yQ_CP*)fyGSsXYSVH?n-7^Tnrk;bBQ6Gq*vAY61q7bDaIsDMlg{7~zFj|% zely7Xa3pg>prNQYfWrafC1FV;)sw(DgNvnDv8FE96~dz;7j;ObIvzPQ6lF_2a!2?F zEH|oJ{I9%F*LZQln}+apGRq^ERW675-tr!{^B9lX#pyDq&s^AAVoVnG`9YW`>N74Q ztnFY407MYAamA_Qcy(PA#f~4Yy(v-G4){qhCco_tjQBTedXHfq3LEmk=fEJ&LI zsIV6{$Z|=`sF;>+*mddlZLEPSZ0|t*=>;Rd_Cd*u3XdOem2Ue-i>r<4I(_rbN|D=D z??7v&vgVc2In2lFRPzv&>D_(TXBI3$LkjU+cDLT_8N|mMuw56x)2WKpt4RtV<=|a$ zY828Jj(%S++PMXSRSDy~O(lBC^Hb=bkUimuVy z%`lv~%`xa)uG%pMa0MGPH6!6?*;=RJ4`EmRnTO?b>E%|HE*s_?2Nxt@I-^+eR$@QyYLYsyBVJ0F9&U!A z!Dmc0>n%QcFs|Ud%5xT5Vt+|5@kb4&zWcV(Iyfixn+6L;ac3+h?aw7it+$pxaP4DE zmQ?L+Q5TjbTS`IQQ<@+B1(;>U2JSb@o3;D17dIA*yikCJN+)A@;?TYYDAV7YFb?lV z0H(RS1>RXy6cSGn9vMa0_grm00xnG-+f$I}#u(po-Bw0%(@34WBu;6M`=eOEy)Up? z#E8#4>fl%Y!8PiiwOFqo{2J}LUZm4{HE}lOY{myiA?#8r)R_`1EUV^B$*+(=0;V~0 z97Gmu_BPGu<5#=N|A8i^jZhPQ8vE?WB&43gW86z#zna4qu7}Py7PW@r_uS(=Hn> z3h8YR$Xti@iup!tuPJ-g zb5()@C)DGPz?mLLeW=aicw@S{y4Kvx!5W$mun@Vzr>V=nZI{v7Z?f#Edxe$9R{?ao zFnZ$gsDsuz1cN}qg2zu^kIdVc^KYp4GT%wKur88UVQF3%rX1{tUoDYurT*=>(K|d4 z5+K|#frl|Uiy;h6&z??K76^yRI@U`Na3E@;Hp093XyeR#;iSg_$+NB`Ij~#_j;$Z` z%?_Im3K-_L<#7J$iN#_$)hLpTzLNh1r`~oUwQ8);pI^h8t?2zcmx|Qw zmQ`lPHf@;`fUeyotYP;--vO<`6Cl$*2o#D&NoS2nw_6-dQS0Ofy*5IUH&6!k7do;p z=sQnE^i~R%8a9Rb?;;Sf#q<;=<{Hh{Y3c%D_4P*~O4!^tYlA~Ge=794_H1+myIK9E zK~0UOX^x4iOeiQ10C{B>0)Mf^7#DoySHC87m&o98T^ClsVDMmsPYVz z;17Ys+H}8f6Wc|jpTYAh%0T~KzVBP(uq8jPyn&d&o=ui!^;hRnnHzVbe#DIAYf&cz z%=F)&lyxk1uU~aa{OR~gU^VIdrr(Uf)GIY&0hY%Ik$U&a8TBULs$D>AB3AVGbC~bB zVmRWj_>X;Bb(i3%{yFZyOH4s7lUt4@d)IPOB&|~Wd?

v#ADmd@Fjjc*B;H4OA0q zBqNiJ!7P67`*gUk>PHTlRlX|5j<~7sS7&1|gm3mTZgsxi8GBNJMqQlelVsM~!vM!^ zY2^+2pnnoHn6=BdyVXf@{LVMXy! z`xX&QTLw`3NqGp_;+>uej}%jVwF_lkV)HnHiE9ve>_t)mCYiNuLHVN!%N+A#6JwI@ zsu0zILVii^7nI9IZC10w7gF}UfDQ-G~~_d zmS}YXF(6KddgHQeLn>AiwE87^qJx!ZCkE{Z*QQ>^A~oWxXOQN3=SoL!l#I@_b1?os`2L*5lE zy9=IqiBGc!sKJ^j%%!+bwkZpvft)ytN}x&b3p5i&BX$%RWgxg~JISgTN0Us1+cfq` zYSUrNjDGve2_7p29am&7HwRB6z{R!PiGB#WD;gdT9S#kgCbMcREG#)k6{#8CUxy=G zN6+To-G>)Eygf%{6qW3Np3)zGpzD?Ud=P{}k@g(CUGI$t#!i-}Eg=TwElAiW$cvIS%2yl6)u>-| zQ=z(pvmPs5l36cd*fXSbWEAI8sO>$Go|mP=ymXvfpjA&opvQ%}>?D`@wqmJLi73;GV-_-ZzRh=jj7;OmotS)f3v|uf_zX@ zsV8TPgS50mSowz0H4%k48;Ag*t*j)Ot195C&zo2nsvf)If-B!7Qr7u;T&99rE+?x$ zn@7LcCC^+iE{EBQ2nAxKXCyXEc}^E6Mu!U`6+}SEMx#zSom0@xvIjl2371B!IxjXV z993qa#oLQp=J+YzQ)~vF%2Iss6&HnepqKjl2L3!D9cks~0m}uHxE}oM%@EO0(Z5u; z+``Wj_jqK`IK|iRm#~Ek4I*xqABL{pEN%x)|z*k zgHg4%1|>)3_m8K!+Kj03OY}L-F5X4E8sC|~4w>9F4)VBjez;|!ZBm_-pr0+0AbbB6-MPg%Xt4%6`0}+`@ko_UzjbjkON}yH z`@p^MwgOss|YYpDRU z6gJyv6)Y0#i~fw|c?DB&>K?tEUej52PXHmc_BQ+HBeG7$*?YYD8oRqE0;b+vTJHy+ zBsT{~Wj}e`w<9_9Xy+%OQQ7%<-*dlYX}g)^mQKqBH6|735}ga8_q%B3{+>z*08?`2 z;7UT%i-@F?DFhrWX9cyae<~9Tt4o;q;RHX!b@n2EI?l)gP`}LR?gw@KDxoj}s);r~ zly5V)3QtRDBeyt*MaV1eRZh7WuiR~w@Q*gGx9`VVqYEPDp!CnapJ15pCTwr40GdS9 zn)V7Ue_8Xp={LEIwZ=iuTnSBqs**w-hw-}?fccwGpOdgW5rHO2P0B9?3XUv36V-F* z<}K~84q?u+%x@sPdw=fY)HX;Kv{aNvEyg+U!mGnC@n0iVXPiLB2_zCj!rx&_jSD45 zL`K-w2DisX@SOEUV$924Po=ob{L|mPd*{bgt!(7Olvxe6uF4KF)2FAUOl-hg;!d+wK=jx+m#VgGH4KC`x}Eql)uf|Rvv+NIUCADpi=IxD7QEue~= zN%5s3Qj9t7?=XuoPzZ2{l4$ z;QEe*1B<2;AAaoD+h+oZlcK>US!H{1W4r10h8SG@AHRV^U3x~ze; z{SWfH-nggL9wCDfp5YSTe@F-T9KU=Z3`%PTKObb+`r4au!}MODlV**G<3QbRj}i zj(}$7nG$v(;=PdVH=a*~L43StTFIfNtZAnk*>GdZvmL-*o@riM#>W*FgJXOYyyiRw(4TYG{b63K* zw3ZmX4RHq9AwnE9jItm4>)}e9*-LV99B|WXEg|jC>4W*f;dvsaJa(uPK10_m-ml-M zVzPu{g+AyKRNhVJf3KvzJ`bJi$#~-Q;T%TL*-qIQGXm}9Z#i8ZfUneyf4eEH@uRjo zGs(#vr@~C8N^D|#e3%%IA&(`|Wkc@HXW9M-Y~++$1WA{I`Cq3dmLlp3^!Fww3RI11 z?A@g`vrLccb07K>vNxQeMrlxJhD+mPyZ(5Qrk>-@Y6q;E%(Lt`g`|nI>I@B`G+;+J zPm;BBb9bxrv5#7Ce=hqNGCA?T-0u~*tee|GZe98AV|rh(zUt*CR2kc^#1y=a zVd^sgXZx4=bIT%lYA8mPe?FdZmqUv=C5N6#^*wOO^zV>FQ3+^UHMT@)%CNoGko!r| zkwkTK0gqI*qkGjg@UgzIJC8wvDqD)02)BXWb#PvApNQ1UZWOD+`c~+~fPXNrRpU?E z!HYQr*`{deGjk{LE%w%^P4dCdSNGZ*pFEm&BdFZ}wa{3l)$4DtmEltNQ%P_xHQlY^ z8bcrp`m9HU)_P<`stQGW^%s*|(%g0k_Mdk;LWH6!351@A&~j?SnNh5cgucqs?$2L@ z^ZNDpIq^E-N4-1OVGlOSM*y{DA{>`9L(d5U)s61VOq8#F8=s)QY&&xtkN#8-rz~SS zD!%aEX8Rb>uQiEetGn`TuDx=6C3F*OPYD z-SWz&V_C@Y#Xz40Kl!=sCcV6Ag|=&SG~48;C*h0SOD$&lc`{#YStB+wMi=iL;x$|ouE=Gn+XgsXGQI(e`x z`cj{OpjKlaDB^|hf5xFX0P2)QB#E@_vS``m+%+__*SsdieUC>n3dC<&vEu+v?bChq z;bg;RvspK;F;FqMz&5>Tqwfy6kT2>T?l|`Z206C83&v(j-@v`yhITFU#mYww)tQ`t zOavh8OUp07ZHmUJOtp>ssRlxI)Ii<9Mdy49rOfx_!I5h&4S!c8~t!k?+CPQT}4?7J=Hfgq|^O(l9s&z zcCpqz;%eJRH#m7m4p*P!t*WYOs21arkdR36juN24M7L&kbJ>C?=r<muBEfP0#vAv3i0K7N5bz?xp?!)UPPqeKk{2l`xIwv>o}bTAaW7Z^ zSfZCk=*p&Vu&Y`SS!BT6W~uOen#duUC^Q1D4o%g`XNNDLcq#eGS8%EgO28c)sQ)UM z>%x|+^ui0}(#n3PxYhJkkp3A`o-PS=!4!P1Kibb6{_h)S!3f5Fo#=O49X15+99CWB zJ?913!^Ph-|8KTBhg3dps5Wo#QGYn%IbjjsE;M}^Bj|wAE`4;5vq)$@^Gz$H0^^6( z+r|YT^*MP~ixCrEY|#Jz$Wtho3boC163+9Tf6i5h{Eg&J%=l!~c4J?@fx(}>f%qPd zm~xxU0r$uZ(B_KQ5Aa#sLEJqJ{zQK$7S0_$?$ZjO?ST%k0MSF7)+0T7I$->eAeQN6 z-c{y9A&MxQq1N~d?4ff-1gk>K{hiO`W^gdXP! 
zEaVjYJ6E8;cq~qYM{<%c$>1Bz15s?ezq@S=P$DucM6eF}Pq>2xve6RBoPf~a_NZeX z`AV^YAb9A0a_Lc^fai}dv%Ff`DgWeHf1@?Z$s38TR*FG&pwuB#yhDg{I5sXs5c?<2 zYGj9hJ(mE%Y<#ilWnxS~YaNpDNNurrSu|L{{~i62o5b^o%^C0;jY}u+1I)azOK#S%?Pb-uwnMH8;W`+phw!iOQ06RZ&jE|xl z9zmkmivU&c(>yoDBP6qK>=hRZuw$ZmPh{|8e}b&qaQ7|F5%t6$SoNvq4_R;z|0Fbp zd%??GjRdFjg=E(CKcPYYIFEmdWAW#BoB#g(tu`>r1)y1JD>Uoz z4;IKoOs?Wi{Pk&MMBM4(pl!O*@BO=zmn0(;EIRgiw2dKf@&y0>UOQ8(QCC z-kW{fNa)oBWzimnUg}wR9LKRJ?t-0ZM55oD!ze!HiBAvNxk?2#gIoIe`nGka$+ZF9 zqkw!m`m8v}cO-4wymeFi==v1Fu42fHP1f!3L({MV3IaVEV*6&obLn?}%6G)kjqlsP zrh1r`{l|`O5Yl&u|2iE0Ii#^|R9G<4yVa)uJ`(?U0oY&gWV=%wotMWB#)1!|q^1A< z7%*nOXVOTCdwCRk$r9K=!|^F)((vUj10M}bnSYnIhek)Oz(@Vh(f^-i+oP+mt>_)cZlgeTAKNZ6w zMx3%Tk%>L*Ml*&TG>G^Q8O{2^-iv={i4dtRVI!!Qng5C_4d`I2iZn?%i70 zKoARVsy+SvQKOuV@z^MVk8EKU?2~jw=ifQPZVR-ch06Si!`zEB7{&~eB8MMnmK#Hd z!5vB_<6l4K(km(|dQkn{zn(K8E%QIjtBj%jw=i=Xo(qJ)HoNq<*X&cVJJ|3N#cAVK zOqg&CuKl%hAU4YTp~!EQ7XGETMlI(N&XWr`R?W_uau!J_x8*ZKQ8Y;Rw{JMh2|qn$ zU-Cs1#Kv*esw>#2joC-|7~6d=bBj)4=QtzB)5Q3Q_ zv=8Hq1jlROHfQwT7rG98Ioe?4$1@khe^jjy&Wa(?MhqT<6o)p8xZh^51!E0u^W|(!V5y z5HjQEzrRKWLu&z4ZAIk22r=ufxo>i;ff{78_n@Brn^5Y!hgobnc)z^7TwX~j{ zUb-Xp|5x=iSk@U^TIO^*&>q#sPsO?ristzQi_fcY#EbF&_RxY;Cha^Yhqrg*5pfmeqxTd`-b0E6uD`H-^)e5B6fhZ!TZV|m5)fP39-UXej${;?@WHiZ ze#_Z{LA9^|q7?)Mp!218aQ@2ePBS#*R%q+MFX3*m$S+eauLvSz*XoId zP+k7_cZUC2x`oxtSM~JtU~5Jr2trS(2|*PTiy;o$puY~AL#v=jV16p3EZjIsq{lp8 zpT*Wtrlg`ev}1)8-laeT*6F&6MS`?*c7I(BKobHhMYwo*HUHdE{#&=%e{GAOVtJCV zYDP22MMgge7HB}|vUO?;@`xH^tQQF3~bKf5)>qpUjmVi#c9a) z|Hf6BU;OvG&^r(J2v#|U>92^C4S%`{B^xQ!cOh2Vpe_B2Sb-7_3amEJ+y40sunG2u zP9TUcs1*J*XG`^0D>1P=0Asi(X6gSFf~~EIQ%_q=|nhCXcj(LYV!v1 zIO+B}29XG@16shog4e?0Qy zyf+063(!Makn}^rjDK`+XdwOIb}K(WLbcc!!Fbv>)ym;T@{s1pxAP6&6c2|9wPyge zUE1YM_c<&kLKOkMEHG}n28*I({w32w6WHaN_WR|Bc#r&_fN;kIv$&Aaca9A2{icGc@wkHgHswM=PD}dMzA5@5l+REO+Dlk|)PXW8>=mf{1CS9O_{;g;Ie>t^l*fqk6S(0-0-?A$qlOV#4 ze{F{+fWXmx|LisTI~@JTeh`Xyz^Zb@xzw@5c}}FdXcMY0bbX`1Tmw_|-|TAOQY=NT zEdU9QnnPEd->|p!zi0=MVuN_xdr_OAl@^L)=#tG?FplJW=OM7Ht}2flU4{3spQGxY z1D5k5$Um9vezv@rJUFtSedZV9T`q2S82lYqIM zGqFjqp(QO@dQtfkK4QR_PcL$bAH$4AwkaS04KQZ-W-wU~FM321j6lbQKxT_YP#nZx z#1hbf|M!|eRO2GZb^@gaP4A&zUxxP2QsD0|LPAqqPr{SvV)aTqZzw-+{IhZJUp{?L zOfbebOjk@WKhLgv;cttH|MH3ko`V4}B*gvG5M&+>ZqW3VQAGa)Zd6+Kt>X0vv`hiP z^cRaFK-XPSF&y|`3;eI*cY)ny466#Kp^CnWd;ZGz@4bR*fO1^+Do^tgE#JgogLl3)1%{5uCna_n-znpPq$%^jLJ{;ZW9#*&Hcd!QTvD zEjnEm5Aa5kmPw({-q-(VXH9%mC;#ryc=#8#FC7w6#3OcGswpaw05vv1i>J;9bc#Q9%@UkbPq`M5>y!twERX z>4DJTttv9VGNPK(3>;(5+idq+gWf9dk(t#=x$;f$1qA&A(9<{xlTc*E$6Zxqg88NO zU#suMbK>aR;$D;DU+t21ber5eWJz@&c_Ma!$2^U(2b|4cFIWSB7t`vHYrOXuxcJ6v zS@Mo_ORerp61K)Q&Cx&xu%I`te~B)ckK$qW$#IX!f7v^Cm9N`p+Vo5vs-mJBMlIm}l^1u!QT6 z+5|tRZlK+uyEZH(r-Bh8+{#JT%;JxFa<125Oro;Grc;>7uz$H(=7CbdSrVh#DZ0#4 zBmB?%Rw3uGo*b8=8Y1%7uewe~#liaMvkI>)TPcT0$r-p~le zAyRo~G11c=aSM8kO%{aW8|W3rJ4YYpICkn;Y!zjNkU}SExM+sY{C+)H%RCVFDuxL~0g77iMJzqL*nsZe{{^kpYLg--+wC!%L6hG;dP}DgGij$Q2TYLCZ zf;Owh+m@+&<~A~{jIUO7_2D<7N?px-T*c5qG9~EOINBUNVdaw?yJ` zfynQ4k1;b`%1jqC>q^O;Y~1gXHV->y_m~?8p9{P2=~MUW;x8^1>*;(|O<;SLwfL^D zBbHKCxt;5a)Fo%0DE+?eA32?Oe{fis(+5|s@e74@VoT%2!@@-lwLPqrEO37Wo}>p1>wGl z{+>mln+a$CarepxU$8v;c{Wrm>hdDL3eQrd(2Ri~6%A(@&g?JQCuOuM8;C7k8;}bR z7lQz-$WXT;w6ZXU?D{Qga5V4^E+&w%O09K-pAcASFV|~8|hZ7NC4I8L)4GvJx_0@Cj z%V45cFTOm1;Y@VvN$pMfwg6uD|H0mO$5Y+E|3{KNQYpKLqEvLua3qu>vmzs9MMhR3 z<5WgvRQ8Gprz3>yl@?jqdzGwXmKhy>*ZX~{`@Z{pzMs$c_xSz$`^SCDIq&ytUgLRP z*YgsR#TDX(59__YqHU&pI(~9s3o$_m_obC1(_i?O_2bsBP&fl%!mcgnp}($vj@ili zOV=uIZBj&e%6CeT@td(R{bL`QmDNG=9`OI=2X;c$knnR>Ct-&d8*veA4o=!cbCqwt z)Ey?Z*M^{<6jg6t8k-%H9#xz0X)e&dkbA%+L>7dkQOeaS)n>Bq2H6@~n4d;&tNFb+ru_DQTXYRP;|m?RMQ08H&k$ThP-Y7_mX7 
z?(Mgw1Kf&jkr(|topxP6p`;!3nl<3#p0vSn?sJ8L0)v!ngT5*_&X7h~zgJL~ZP))T zq9E#L7&-VE`?jBcd>}MwvN(Dy>+wIqwKDfhlm$Q)&f;}K*=d)5E5}>0XqvbFoHjVF z11idp`qiz(w-2AQ!_cEphmPtq)nl)fD|0|PN1=#e@`XbIwoKGkG_ofSLH@HrBW6r( zDtGsMbJ`{3^MW2H0dS!$o7FuWmJatcQH}ZQ))QQ}unknerKO)sTI}J+kJRm4`YG&o zBjV+N*vA`_G=ys$>R0*k5)_PVz_R9ZJg)FX_wItipL3x)18~T);BGAW{bp3h48!Q4 z_tL>#|8t(i-#vV8m5b0HJT*PNb{=@{pakr{g391vZ02(C(a*Ei`x9C)D!q}t6CgGq z)WJa4!TmBLiJuXqf$7&hCg-sJf&~Ef{J#zSF;m9ys-3hi0yD73Ds~-??jma=X1V#) z!D#5PQ4QWxi0mG+#kzy4e5q_E8tL2YHa8LteVsS6ZUtY^0&x)a^RGrnAHM7tt>EqT zOWD)>fhNxK9RhTeUvU%)0vJX@&W8{$(&3J=Hs+<-G^TWvO}mT$Ym{@zO*FW?fl3Cl z6VU9ltiJypvLPD?9)&+c`3+ zlnMJDrlLcV+4&fiqqvdcx}lgbx@`gQI2^})O>}voe0S_n(M8)>u#xhgFqQC$etFU3 zKPD3kAF?bsm`9H!cNE$xnZ=)hLk*o->@i4}ESZYWKSPRqv);&A_TIMruaI~00&~uV zkw}PmE9CortMfI)kL>PbLL%O#jA|})(7z+^kh*dr_X&8UwW2>*^Vct`Kl^EL@Wq48 zBNO>D`JWcY%l)?#Ure`G`*K;7u(W-F&RsIn-O&bqFf~l6#IB?d@MM>7FEJRh zXX5^l-oX0L_vR#S|r4kq`dwzKB7V5sqp zg_p6cjv&8e}G6h{hh z7a0q$(PRwKT4_n$l@UGcPvEZ*=skkTK*{z)j|W-}nrVV+R4 zQd_#(H8I6x$Uy3Ny+qGdo$$zw8GB}-^I8nTEY#~*dte0k zurXjE^%QRU0ZF@JKmZg{1i#j;|M<8b<}yD? zg)6e5$GzW-*=O%gTa@3lEm}9Iv*>}1&!9`F%Y`u0L<`n^ zVFpP-!<^w$`@gG)ux`y!iONK9vxQ0j>fwa!$0uXk;-i-~^fCCwdbO+~P1 zzkYUkJAC->H=4V#bALuK1F{mZrkw^JzF5a{tDcg6)_O%gQfi_kk$Q8nU~fHd|P z%U3;MwXtnvWCWj~klpX2UOuo=oHrchJ`O$SJ*# zUs<+A@S?X>dsZHU?m~f;yZRjKn&?Rv>UdjA45eP#Af#~)C5TIE#)GE!o`m}K+3vL5 zf?MJomdj@FWf`uok;-=3ujoyL)9KcWQLYs!EERn;uiL_6 ztXfy)Ri@vvSnN8P-cvi7>vArUAnstRfH`s?@5+XZisi;{Z8W+I1{`tS`tK^SdLeq> zWE$PCHc#ms~kdC_4{dzGRFjkas_ULpIp;fWpwAG12>8mK#F(ii2P zo7m*B;@j0?CXo5l6bdF*obS+&T=BNfZ>AKbCaic1aVt0}5t@Up|JW;;BDO9boRjUU zx>b(2TeHZg-?aCp_qGz_c-kAn3i<&tBmP(2R>srb)oCuYSG8j1yw>ItEZbU34j`>- z^jEL0o=zlYWEGgcp|)piMVh8Trv?8L1LdB%HJcMkq2o-F&4L`TKIH)0NG+tq@r_*0 z@Jdim6ow9(ndjc!DaZk{)gwe5KbkRP`EdcrTmv!-2(h%hU*JOI_~=G;<(4~i?elSaPYt$qH=CJ}Z4$E1wyI@pskFE+Q21KQ!EaQ14SIMi z5~BB5y{&sB;(U)V)v{#baDSDQ*3q0N>|%7OL^M@jOc6ka6BRqlJeDGWb-&izdklv( zAVr8c+%6|wXKBNTxp)qxrV8$qzgdVdihGfIH;*LuT#8tJ0NCZd;P7(0V5!eEhbiKa z&UP2Xoq)_qzil!qVorwUF?K)+;@hmA1D!M6l|w7%2QC+@JnwS_mW3vT;&GX*k;uJ^ z59Y_3l#u>(q9!w5zHytivQl`8<;P(ZyFi1*FmTJXdN320YP4e`JfRAB_*vTHM^H<0 zftq2*V9=BDrMiRRPoXQhep^P#^h-NII1Yy!gm>IXvPrYn$E-ko-e_do%)%lU?(>=w zseB$G-TvqdnPQGz=8G+zT80h}XxE3HLZ)6ogLK6jYCd4%v1expL*o z*XehMy?!izad|boU6yBAX5gp=UPj^gkwfn&Q!KXVmGl%%%tg;ktewUWUUs_t5eDMfb&d+ z`@`tPUi5GQ~a?!($Nx6{ZZ$`1UAz8kYZNH8KdGSjRtY9ex<-3 zOPFEzOz0Y|%o_1?u1xjy;#x3}xh$|WG@EaP)N71DiARmi{qpw+t;zEH{F~F~;=_%u z=9-_X*R0P-pBU_MA6_fD^tv~&AtFDEo>Y)@mQ!5UW>38fy4EZ-LWeBscC1?_J77K^ z9RAGEw(MaPEAqn6`nV-!TY!(QD&H+^k;1Y{p{~kCrqzwtpJf#{TyhZ6Yv-wD+*B?uug{ogzPv32U}FeJQuZTs`PK-Rs%Lq`MG#xYA~y#_>j*$|SZObaRv6!R-IP2qB z0zZplVg!!R5YX&fYCD*^u1S3UbdYy0d!n?T%Db_>%c%lLzQjj{x}yiV-`rO4+1&A= zH9BnMa(^^XJ$A*0FC#4yMXu6N6S^fl4wQXTn@;U$A1V@GzBPONf zKj7lc@V4k%j9i}D$$gzWcW~0kWxwlCn!#p5C7)4G3hC5@h5TAP#%c8yy=b-j`gmY< z?=@O#f=tOg_^5SAW3y_c(ZhU|f&CH70^6Q^b6>NSm>XdAi5EA7>R)=5mD~=?@8W63 zd5L&X$a&=Qw1vFgBtF zu`D*J$m%qiw(5ZfH&c@QQg6=ocnt+fy1UG;66BfIv`Zb-0d}Q{T_a ztsCFzFKc4xHFPM zeAf2mUAcI1BO7CcZAG1d|60}4g&UY#zK&Tbw#ZA8sZpDE73(1P6Id+e!HZjdIU_T{w+{$0&99Z_VwYIK^aMBT0et{zdKU=%B zyRgedj7lMi+fYW47eohNem695kMLCJ8ucv~g#;ho)xpy?x$+Se>zfsp6`v5yXGE3^ zIJ~-NY^`Hg*XTKHq&c}_^WsVFzFzyDin~s^^>laNTYM97S5^}-O&CelbsNoBxHEBDsH98(JO|ab?mG2LKQwpkz*AdZAFr}?lH}Zuv{@_i z7k1}~t9Jv8-D8x7dQW9yP7*zpaJbWf_%`c};fG8xuiDiuzjl2?n%tIo7I|MOOrMY? 
zI#RvMH=rV{6nA77zK2kq6+fMx9qy5FOkDDIR$FhCtNO)~ zFDLe$GYVF;oQmw8{655_d)u~gXI*7&}(Og&mD`My6kOQrKjjNLwo1J*;p<9y8B~S&g zR8a8f78M~JuLT|1S+IQSl%gYts0g1-OyNg{wo&stNk>BEI2R=aZkj>Y<-O$-^$tsh z{Mt1KUROT?DImx1(aOW-UUh}{HXnB~=;p=6`w~nw9r}*1YPn=YdTe~$I~O^3RO(c- z*G+>gEsY1Pi|)gQwbUiu+l_|KD)1OihBWNYjm>!3oXh-+3m~mldh8pBJ#G`Dx%^H? z>Tv2Srgmy66CTu-mE$(YJKUy%>wU3$9Y5;(`hiqD&~L~!cP8zHxGu08iLH}Tp6E@d z+TgmEOs;Px6kBAIQvAwYrzXxVb-4sN^_Yy$IcB=Fs7roV;RxRDG1Enou_zul$hS_P z8pts}Rhdd$!g#K48Yn2*G_@Qy=ec*sj?ER`qmN5ko#L<9WI+e@bCJRv-*@1w&5miH z7w3Tkg=3T%Gat@6rz*=jV>HMRqo#2O+eI0^Hh5vtmvY-JNuy%G5XwTlZF~}Wx8>tT zMitiG>=`xW?7UyKztXbjN{JH`&1H7%CL(?yirzT$@vD8V!J2Qn(j(*$82_tY+bq~!a3IIIzA`0 z{b|~eiu3B@v5_a7p-kqO-b_&U~x_`{;Bg7E}22wqN5)d6R7KMASr-0jCiMmUOimpIKavZ{TX2;gp$n zO?_7WF|(K`Jma~Q;oZC;Kqfg$tWJ#fPL|XY(k{{ToP%e-$y9T5d1erUX67**7WCzU zwgdMM=(CR8OsTn37SUg=u5RlwzxqUGd}(WZ5FsS(Ov2-)1-*sUq=w>l^&ngBjI0S> z8#hlm8z20@z~rjH4hx$?!jRnd1e*ac8oHa7QX^>Bj-BXGZu4|6yXcUf z#VwbXuQ)KXQryDXjzp$+#y_;~N$u!3!|=A~ng3Lp7YSU7t)s(Il4K>+K--ycZGONF z@vHj7?qT!hquT1V`m0^UTb)}Kanm%qw-x3$dnYo=xAQhEZ2mSVzk1$MipV^$=r(Jk zs_LJ1wZoNxM`U3c@hXW|UJw*7X6Hz}JRlKP8H>4s=dg&Cy>8dO@S0lDv?2b2;AGyB zBTUo+?~XcqJ2~Zg?~EsHkE1XkJSIhzl~X(2?mxEhGLTx#mELc_x2DI_pux^GGkIY? zkA11#2uE0~^*Ykav2BH;^r7}_VL#tV%LVGCuBsJ!n!eY;K^bYaR=O)Pv!cr%D~^;F zjJ9T(dOu@*o*ZKPiu34o49|qy8)F;rINZ#gCb~x}V`t(4j(S2IZ)R}KoiON`+qO!m z*pVG;oO`}#aBXnO{}_omA}{2&ZEi>UVn;95WvuZ@TRGLJ5?K2z*jHEJ>dd}zf=(!I z>oeJ*9C%Xp?7EN9D7}~yS_-0R;R(Rk1yjZJ`CHDRK2JLu2_2laEdp$!`M5%Fui!;q)ay(`N)Dff$neSch=VVR0UD&c^A z1@XB|{P!Kbj+Ua`Ih9#OyYBbDeBbbxA%b4MeQ1T%sbrHSX;UqDAlCvtCK|AMmJhen zm%Bs|);?n`w~wq48B7?Podcah##i+%IlI+AKD?k3?_zSOv;DAs`OTtp8IL+}Wskk; zJv|W)zREYsya|ACt*yF3y@YbD-Vj#0!yU>;OLxW>Wt~|ywEQe;@N#du{z@%NJV}2t zFLTv}`k9d+R%9aR^Q82B&o-O!oK3DRoUYqE7`vRtq(;4*O_Y`{RlIDT8Lu?)>9l$G zH5`j!(rdVj==}!*6r?@hqYhvGwr2A9seCIT1CRmeR2C-bC1Wc1p~toR#$1AGKHa7| z=8a;Ipm|uQ-W(#K+aCK`)9`6#`haDlw`a+%y)x=?)Xp=P+9|?t8#mvhHG675P_9?@ zh}L8n9Hwxf!BKHvnm(Z9$M6ur*O*-x=d`XJeiDyx{pJ9IPy?n^hq;{iZEc<#gA|7q z^N=Eq=ElZGbM6lMC?AzN_rOcXq_PwM6^)fNmN}bp8gdejddxK$X|c0?sRrgwQ6OJQ zn=r~ex$4KvD=O#%#3s_p9Gsjfvi;s8L#Hl9Y?ys-2_AZc;%(}aA0Nx5w|jf?jxA4k z^is|Dd)Py_j!+&scikzDA|bI|gy%*jXH|kYhRd!yg$R8uitZXOTd#|0yd)npUA7LM zFkhMbf^auCso&D8>(loM53PxD56z0rXPpA?OC;Xkh_t(3>K2zAm@7Xxnb%>Q11u9j zG3@SNzu#u3q+Oi%F>xim`2l_jvf~Si_uQ-fN`e;SB8SCw&3`O^E%JBcQS`xLC?B8Q zLpH>Ox6=5TTI9z??Udd2_%3%Y9c#)?i2;i-gF-A4 zn-~CJXWik-uelM|i*0*8AY9Bq!AX{pG|mZNVG<#=>22>1T`0TUeS0(Ww0z=L<7h1A zbK(}f3e{5?%58Ko5}I>E)+5x02w?D`_F#o!d$^0|I{+Uq$gh13iPQ^uoyQ6<4038s zALDpEqN06i`YACZh->j?vBvYvP>&{M_He_ABa`3vo-=}BxPYXH=rHjvD`x+6^{2I~ z8G%^4`tgYl`mhG(*M`{zHR~l!nJM#-srWk1+i*c=(M!{>^_%V@l?_OIQrYF%)-0d0 zec@$vHh2Ttf^vCaB@?^a`Qjd@3-HIdN#fI&sdNL?u}XSnrX2v7kM_N=>B+avHoDKo zkD3aDF|VN1(;NctVxB9`(MR2K*y{N;85J1Zow~#uq~4T#7=enM*XNv{gAja6vFq2y z!;n}q-?z;N53eM|Ls%*OxyZz&X#~kc`gobQB|F$s%`**gjCBTK+tdmd768DXvxt_T zZ&Wh>sEN=Vm33tR;8S)XJ_%F)N+hdvH$t~?L_$Ihlut#%X7Z{$HG1`q%MXkBk#0v? 
zSrEJQmL~ehl}G#aa|CWGSE<1Uaj~!uRI`>GccI#{3!4}dV4C!B-!?u?_3>BD8OLc^ z@PgKbR{N3N#%|kGZAu8}VockuIYsTve((cl*E_8z>I;0>L<8pJ)4WFTH5V5%t8;Sj zwe}XPmUGBOz$I>wKCQ{`7DB5VDru4ZGD^~M|MR%5IMGcsx+MZh8z{5k)9+Pk@3PVy zxQM{J7FTR-8I3}2k;gA!(hGw4jUe;?NTLDS!uHc zFC(@0$TLx@BaPWTATP}zZDnzv2`iS*g2J<+&!xj?O%Do+rxZpgC}~<%e6V>E4ThUg zWD6%iU$_G=*d^ZSYNxiX6+d_wXT8ZvS20nI(%tgw;OREu_=py=8#Y+F0Y{iqI=+z= z0mO)>FG3+(bIEo4ozjj^#pQ%FgY5Ux=<;{}ye+I$@XMgqHq>-)f|GogN1h`D73*TS zh6ar*R#p!HO1xrke?1k{=vT%~09;)71@FLJr8-L3jT}t@gICmqWR;*tkku(zeDDYl zdE8Iw3pwz>_HY_9MED>*{$b_q%|h{A&lcI>Ffjd3&w;em=BB2HW95xE$l~ph=%96C zjPtq3o->IY*PLO@WZxXb&@BEQmWslNS8YAco%k%tlnJCo>shQl!x9jf)<6PXiud|j z4qy-65&|B0PM2WydK#E}~cshzNSv6$(eZ-H6W-7vZ&KR)?+U5=u(lZ}DQSz^O zdNQ_6SVoC_6RWq~^mzoyrT+^@f-Jn8Zt02`PB?%RI}FiMi4ybH2P zy-u^KF$@lC>Q(HxMiT528^-X+c66JlDb6|VAW0TqpXnEK{8GKcJ^r!I7JRDZe*T5G zZIX^pHMDl(?tnZNG)uIyAW~gTbcc+r+J~w<(Zy>`yQmDeH-@YxK$9mgw&*8Bomte7 zO4OjLHhm=5NJaR-Q$fgkEa}*`SDKa5oSp?Ap{}xtim>|yU=?noMZ*ynzT9Ecz}HAi z(^AshVXJ1;UFxlH4vzaw-X6c;OFPkqA z+pbIqk`&1U#=}k5iYeU)EC8Vq5!>~lEF_RuL4M}O^+`U>DJGZASs*eh6`!1R;ZMX7 z86|rM2h0t<@n%XJa5pyHGHn3n=mW4vkFVbr?)&Jsf$}STr-@ak2u4!!l#NZgXALS( zkoA5A`=+_T89BSTH%$2RhTj)wQYP4)&SLDi&r@Maa~i6F>y_3rO2H=9RLVC3u?w4S zRW!b^6w=Jwa#I-{@CKYAXjj9BMC1-yfjl9frD{f3K(_ho9^K!drbkuGvUlr9r|=K$ zH)^xna5t#t8!A_3jHgu+V?(yxA?W))YK zW)30&;bjPdAgBx9@5SQ8PeTwYX&t}4=jyD^$# zYxc#ym56MqaEzK9>E{eqH4aI5ny|1%v+8k3THLE+2~x{BGLekva2WFu}2b`*}y( zkYmGz=eE9?rWQQD@_CCTjnem~=shp#H2O-|aMpC)TPSJt%A+qCkJ>7u`owdgapaiY zyTF=juAeYzmkk=WpNqZk);?jAsabJ<>eQ-DL%DoCGk^b5%KP^!CHJ$)DzeqrCtW&l zZG8%822G6u6~i<#p&)uyA&^!#IWDzlB4SBLsX(%$jJj||#r4d+?YlHRaTgP zHa6#&e@P*cz1yV0ZHUf@MmJaWNu$)Zo^u*wk5-F?T{ZelvH;M$6EjlYo~w$V&+RQv ziK}(97y9b?+D3FWPn($cz~gz`lIs=|r?Y-wr_7>iE3qFHU7!C&bZw>6ojQ3W@P+qe z`5_Z0??VR{j@Es@8ZEHaIDITOk8${F%6pNe^U_b~Kb@6Mo(;LES6{=ez;5icBD2r( zetLcBNRH~RCDH|=oAk_7;roLPECV&2SeI73mMc3bZRAqp=~4Hy)Oox)aIL}-h=D(e zyfYL#fD^XC$E-h8?J_hh;aBs<-^V_ps#|g!Z%Xzr*zsNb<58XP)CmqBQ^~p8H#(EF z8{8J2UW=E_oDOS~bo(ed4)zf8&1#mv4!t=Sm~Cdzt_iNd^6pPT zTx8)Bz^@t9$VFD*e464QpJKF}cA;{nifZ2PEd=8G-*V(e6kg77N|y$=?vIO9ap9u* zOFwKL!}DXxw7T@qUNGajLGiS|k4{JLW1$c>k=P&AxSK4x4B0@Ey42&F7r{gqUi#?% z`}$F~(cFOJ4NH3BB^VV+{vq#CgZHT>Hd35J${}&TacO6CKP&#dQcoyhLSKZ)#Y&FZ}*(wM61)yt7iQp(p?KjhiBm zCyszghY=dRh0Z7#_C`jGP~X-L3lE58&7hRo%+ z>dnvG_=T>#w6f_2yQM`U3Di0q0KVrwjJmP0u_<|Zn5&o?vMKi9269@Jr2h4^f&&v+ zcv1#q|3x5ZyRCXG%(z=6Yz;kEcfj~(6j7E&juq}D?N|j4GKwN;*#xV{nvz@E%Lt2A z?*K%pL-_WS5^|b2b@sRnrRuV(2LDoD*hCJB5CXa?cejJNH9y<*)6}%yJ$Nb6AT+pR za5o&ThyVSgJ$VI<3{p%gyu_SFPFuRGCUOjD!dZFw)9c<1i>~r28A|wJIg`K-VUxQ_ zwvPGCDdXeg0ixhjS66TD@1H(BL!&#yNe5c|mw!zKyZ+bBfc0}BWkU@APXjwYFYj)q zzRcCmLqLLgra3y^S^;$Gr{4jXe~jR2Vs~ffUT$vgyVLaK%Yx+_r_7`#oV@N*M1nc1 z{#Nq;(<(t!403A2fkUhsRJ-}NFCSPNbjt_;_#cq8?G^yea6d4OHNdc8`(@qI{0eyf zst#ke1!aI<{MXOY|LN0ImdyucaVKr^% zP6hQSCmLLK5cS7m;Y&W1O6cp0w}2O){wXyVJ+(Bt0?0YlQssL7R{hy-Cc6K+zWcUD z0l(Y)M8_GRQ5pYlXYC(S_;g}!t_}wW$J1xePImY7#E)|W#Hys2$k7z>Z!gpf91*XT zYzE92f1WW&?ha+-l&FUKezV!Z%9EC^1mwv1Lrk1(SN`M3y}(&tzMSmr>@4tyM=99X zHjKKE+;CaYOamD2U#=*uTGJcVA@q3crrB>)tP5lld~h`)KZFXYO6e>Rw&48=dLi)| zX(T3Q*DDn*ZmIFFR^i_--`^JmBF1PWPa*ODS#ARo%8#V-Jmgmue4iuBp_bUqO=L1| zX_L&kt$TzPM*wkrHa?7OkqaD$neE!BEKWZ7qF5#!m*$wq53qTuz0<~lk~xTGX+8+I zYVB~UKYC7l!JqnhXnpCk$$2ukt*HENg$|!5Po28|+^|UdeF!h_f0f^`1n2jW1@}{R z1SttXmQlDszy=>75}^Z5BNNRTRy=|LGG-T{zMzi|Ts#fN%|T?Z?bS&N_KkW%PlT(Oy z2mu3CQE(eeNl4>$B)zDn+Z)oXYN5kWWZs#Cn_usngxO>k3EFsv@W_1uEUUIVa$X1# zZ4dtKIs2Oez=hE|ySl0=DcwGT!JLG&F!3^x?f5lFw0H<~8G>2y3E%51&46;wZ8c3J zUKqTo58d9-5T*u*B_rV7Km%ifwDW#WvPK%eKleQ8eGDKAs(-IXIPn^BQd-a+=TEG| z9eVIp1JY^s>9w!e&E?sr;<9aCaMs_zrTYg6foQ})L=8JR78>}2@H2iv^K 
zkWn{un&`L+?+#}AukwMFYJT7q5X05pp=C3J46mOey)E~2O*8;5!LVcGlodFGf7wm{ zMuPJ*SRG48z&@$pU>`qxEb2+p6ZA;xfx^uY+na_$#3us5pQ!swgVw!0-`rROYEqf1E*w$O$g>wE#DMRRje7!$_0R zjF-BW1lex8|FZ7F__0iE#uJZDi9Q5AwhvGOZ@xVH>63_d&MA1A=v_7T!63tK|74oC z`#>h8iH@ou?{?4!n5(|NzPY=b=%{n$Co-+K4e*YS(tjE;Pt79X^c!sn7k$N9^paVX zH9!YOu3^*=gqnh2#1U$fm3mN(@Of(|m{KqE@;*G+rSvpA`*6dlJ!`!P;lkZknf8BS9shlo_aT^Pmyh2V94VTo+R8M#r^X!^08|0v zWnp*{-QsDwPKd7uPmGN@6g+teG>XVEl|k}sGB&Q*P3(p#x&JRDH{5(-kii>c8o{<; z-TYszH?T|4dvqh)CLt$dDX^cG1-|d<}irWFTHW? z(*7$}!?1DTJ#H8yPkz=VM-G2KPf)GgI zZRx{*V3Vj(Vg<=E!5kR1_DrMLzd+3zRyZ|TdB9uJ1;jh2qs5e|nR&R%cke0L3JBIC zyfa0Ab&99dJ!~8d<{Ht_4wa6Pe+hI^$?$x#8Lg>@n9*^6U|~0v)m-85cJFyu4P>6B z3QnU9^&)+3oca@w0shNzUm%tiPr}An!SkDb;L%h0070p~1alui-KYS;#C2-Xzi`rE z*PO5pLBNoGXmyo3t@p~1RcG)ynH|r!)9czkAeiJCzt9Lcr40oV$H4OS!zR(7MvgEX z4;|^}&UECnKtQ52;o=i!R0MslcL`RBgim<8%8M9DO-AZx#-?+Te)14$(e#AD5Mv!x z&|Ncw=Luepv)Y3`kVF&6Eqj)liEA(L3A(1X`~;^@Twqkg=rY=IJSTcvKupe68hTZv zWMtGc8`%PPoB?h5?*H^uPntiZe>^f$PW>FTu)=m8F%;w?j#Qam@^a*tu2iPJ% z(}ckW8wl=rhgt;Mh@6di@{}JPfjQlm{@TT?L$}}~q>q|fJ`` z=G{`9w2lzY)e*s_&{1xKojdEJvJKrvU7AiLCX#YVYMct)$(^v4pRx=0>D@?oJOyJE zS^W4d#R3`SFg#5Az0 zZ(deUI3cVs01+jvW~@o$LZcCK!8JO^ti@jtEkK9;e` z$uK`Z>MPdPj}{kQE!~g*azFFP-+x4;`L&tk_w9nX7ZTY3)&!jmBd+Z^Ha8bmT6p8OlSFdE8NLo*bixgFfv*b3j4tcA#wmt)Qyr4 z{yg~_iMomIbM=z7Krr)d#Ti;eRP%UDKFnK(Mp7b$X0IT7LWCE(ZsgVwL$fbejZDd| zEe{PtmVQmo$6z8N8ahIAcllr$L1@?n)Tf{$j|2b5#ahaW0X;N~n<7Mwgn@YB0l2)i zUp|D|+!+2GDZCBe00tP4H-?{c18JB8wcch^6;Kjn4vdKUtMeZqlrCbj{F*&yu>#5n znjzo&x#!Apqs+q3_ex2Vcz|cDUF4*Z7?LLJnO~H(rR@RErQ|s+xBgU9a!S zs?{8^+qju23=C7*>yjVS|8ijSK(}VZW6m8zZH-et*tU$Xi5PTM^APYX!)T`HkqF$E z%s{FsklZ=2K(&I&l<1PEf!(Yl_Q078?_7nWo$~?DavNsEfREb(h&xCJ15V3K^ym*h z{(Y0$%BY)vB6OEvje}?-S|Z>^8Ej|wWyQYQ12d0Qxt&j*xi&0HM7JS|1HqrbV;+9Q zZ2K$-USbSY|j_~eKS=X0bsF;m`>8l2D>;c;HAXhPKvY7S33c#wJxOw;a>BN|7)8r-^zC>8; zQYQ2O67|6LBZnrnvKZz$(ZNy(!O-J(_{B)vLcB0TuyqKCNLE*G@I`e$@TQB8Hk z@tPU{JK*WhxS_Rcq?5ET032hl8uwGNj2Xa}ZTQv5G72$NC;(1{ms2g7ZucsT4t zD6D^sN@zKsrX@VxMPkqy^(bGlgUA2&E}J=!qO5uGppW2tfMk=r3~>-!QB8~qyp|LW zvICKW8|X^N%X#d#DQ&>+nHq}Dkp_>$VIkBj!15ue zwH2XttAM6@y=F5j74ZywbM1W`4|=eEDE&S;6!gelg@HOJ!&f9fUvEjF)h(fYT)W%n zPdfkt8eEbFs8z;d^tZqkE9qSlJtrUo$(lV9FxB>0h-*W_1dVQbz=vn(oVURJjlq9P z%0}O%w-srG00IyZSVBO@Mfe;oqOc8TJ?wfi<78q?>%-(e8a@bymC}%RMh^yWtoqms z9Xx1tBEBwO_8h=+LQqC;_JD|NBNKJB=8Nneoj3l}v@hv<2?=zmp9ASTNy3u9N3?{)}3*2{va6BPxwGR0p=n7PiS|qP`7X z=w68QyIAhGf=D`YCjNm2Go9du!`@!W*P=|-4A(TK{b7m;`Y|KgU(VGSL8=Osd|)@K zz)tMA#w~}z5t275<-$tYz`tY$?L{^G0}c4aIas8OeDV>SO04RDS~^7S#e<1Fa+Pr%C_Xl^At`0+mXafIDYI)v7;RyEfEY@Pt~X3#6ugxZ0TDR1bFP3M>A zwyEo(4DuSrecZ~B(Z$Zo`l}v`ll!39umMH^IVnO=xT@N}hL}4(6dS)S=JJz329NVzb`QW> zBYIW#r8DYjC(^?}M17>t`XiV;Nz&o_w_!2U*Hk;@+?}mgPgeB;0^{Ow(+c88VW5W( z;NI*)#j7sp)>QW!kgQK$6{)}Q@4bj8(CbIGxW|!@$@Nv>0VM3y*C#F@VZpl1 zqhj($r>T7a&Mom%iWB*9e1nAmc?@klJ_n~0{@3~%XE0gzUifQIQcH5@nUI@~itf|! 
z;B1N~(okMv45Get+aC11H}3%}7Q)iwV`C@IjgB$?9PfX40erb{=+q4Q*n9N*4PKtG zsEgVK5!+B8=G}I%nw9Djc-=v)kvjZ7*iu;7uPctO=qRqk+V*jC8bFQ+j3Nt$U0NO| z7YF~95>ngc<8vE6%)~^ej@~R3g;9?xLcM{Xe^TR?xC`=fbG#`3gYB?2FQKRdo!#DZ zFzwzjf}wN`Dl#Gc4C0OaQNaLhr(Wh7-$dzxm=ck1G=NtYmLu@@LE6SfULYcpC1&T> zJsu3nWpYQ4z{07)O^j&%1m*yr4>ktI-g2~fo)HzDVSgkKIzCI{QsR?N_5GR>xPj&349 z6?rrP!pR15ryA~Tpu`{;;$f*s^h!IZ{8q1@PM28y^u*vS{%?j1FaIN!wQp)qpCfEV z+X8EnCp<+L)_5=DW^^3da1COhZjrv@|khpEt-sVP;ZC#~c95i#cno!2=dGMqukZFfrsnA}+nZvmG@t{*nLp z^G{ECE*wuZ2uapRU_F7LeQMN??9lrY_E$3FffD&`%v8X^uZQOAb#jx-qFwkNSJ4@M9iNwjqMrhXahcbszX7Hb?__-N;v@n;G zJN9%3Y@pl(XN*4Q1si(Am=MW#@xl1+pPMWt0tV*)+XJsA%0S&TLTtc2A0>(o9hvkr zXvbpl?JvP9sDYb%fGzMdx+l+Ae@Bg;rkaqA2IOp>xCP6xMdhvp*ddBA+gtl5$y475 zzsT$`D~74SW^4X|0lZ+4WeouFhOskkfV{}x`J>2r2bnDPD&-#rIvOO@ zMAjTh13+pOz=!nre@`|c_rhJ{7X?58#^wOFciZ@^lcarhWF|@n{%MB;8 z{ZpcBr#YqG=FPr>V24P-tQ|DM$mT%8Z^44`YzJ!6T zPJi|RI!42&A0bJwPn(i5&>^<{fnR6Gq>UKT+<|M#sIIICQ&fjLx{;c}A>+zRG#y|A zu5%Ocz$iEEBp|(~PAACA&A0>l5DNY5kJzjyhsX=R@nft7-RHIZhPMWC(B~FsUc6!Y z25FrZQ|ptK3L-#Ia=BgJGimC4nY&B-8l;xif`OY0b^_8wy&PCmMy5LCr=UPUe>L*J z1;K>1`84FOpv3i{PNwiV>0rf=ocWmn1)Bu<@6bctc-DR8%?-O%2=AckU$zQ*r|$=R zv7vh){oFw}AQwce5G13#;pkjlDG5kSPCTw)ta749dWbGgUy3hT_nh%aC3LWZ@6xW{?;(D@V)4xC&R*x#6#!jsmy}i! zQ@tvu0MQABvWQLB-9lWBIvYA7gjnkHW$?K;mr|z!HVvst4}0pRi3wE`)uJBA%BAmr zVq}uABEx8ReZaF|44&K4QC7Zoy9Y=z_*eSt`sP=^Y&&Q8u;c=h^cNO4i0>@M!K%LPWsTtBs0e62HXa@t`ilOD>EFNl$nS;65H|;(3hI+ zjaZi|?(6$Ycz(&^`>z1{I+e(=<1}U420ynGi7N-0DcijDS@3JM$9P%c=WXnQ$@Zhi z5)nz9c}I%u*q{#WKYsiG&pbTxm;Z}O|G)iU`K?V5v4X1OfBc}-Gv0_^*MI$i{kzfr zk1vrQM5F7*__sHq(&pEXT_6{fj%ktOJpcM)kWQFiAN}*icR+SC(kdVRccX=W`|Z{L zV-d*X`2Umr+fx6vvp;kYAj9i>tw{|+Q1Zb{%uN*H=HU<=KL=s{gGd>N_Ya^g4M!T6 ze*>nd@2k*;P>ag|lP0cHkGWF$XCO>GKlOlP}(U3E&TY@DZ9T zBOEBl^u7S$91>?&L6R5%X?>2=VmZE|*jx?i9Zxed)RBU%_3sJm`@C0L5=s}#W?$1M zP33q4wm!8m4ASds!&kVeZdFJ*lunvPICxmwO6@x1t#8yGD5$nA=o^W)F7sUMs+wp4 znY$nQ@;=~lAA7PkzV>5O!DH$sbmJ}cY;W{xi-!y<$L5nlAE1r&U`GkSuEWumk3SGD zGj|sQV&_=1FYQJNFVuxW8BRC|P}w~bW0e=>vJQEp%J}EG#`b}|+D?2vUGH3{*+So4#>4SkLNPwXiG!C&}YDs37xYLQ?Jb zN%lJ2M#xw6VL^XfYr@V4N%mKFC#)g>yCc*9*FyPWq(`;K8syO`g3v7}y{xeQ{bSi7 zi5YA)Hw3Qo;~ZYoBKg=!7;pxe5vnZG5YRpp#Q~Sq+T0~(s8IO=o43Gh^9pHP{odPn z3@1Zewq7Oy(O{P5!&|GZ%dol!ZUn~v{nBYBfc>x_Jv|SdvA_-Ud6k}3PTUlA=IGbs zxa6)9NRghYKhdV2&S5~@-wKp@mh565j~polB{dA1<#hkyzU-g6gmhWPM#oawdfD=g zc3FGq8Toxs^fOm8ezhGk89^P+;+U%;Z@+A3dXepo*~-mj(NQw#b3i)sEi2hY1K!u< zpl$h)I^>FGZ+Ew_$5dXVNqO()f#vs-2+-h#iN8zesAH7J3|_Uenyne1>^*q%6TNng ziRY1v@<4q?a*AZbTc;q1D1;e)tfcV#<@o%R7B0dU<%6qG#|`8U1@4q+(Q$>D8PfOF zf%R$b!{R5!1*_|B!cAEWIRF|{bAp$-Vx=w1y-NK?)b;(>p@xCv6y12l31Iv0%4=XC zSkzTblZGr*Qm@zDJS}(mn1T1YGP_aOVcxc-2{{Nf?qoAwsZviQ@~*a~$q!FSeo8Mm zv+4|6w`!jE(Ly5Kq`Y~vyYW=W<1-a&z^<`$rVYwm4vXI;*--aPv#*S9v`u@g)MI2V z3x#22jusIwOzNoKdU4KkLEh9{$oOrrlv?fItdj=T&AD5IK*hve@2$I9FZ-V9@A4CG zPhrF#jQV__1}G_gDfK+0cD${J4n@<86%lNBgn0?b93T~bKtXl35)wXsh*{Wn)In_- zY1CwWrQ^0TsmdZqvm%f>%9S3{et}Hn{eI=a$d<-z%@QD2{h*e&DNjlpDn1y9s_sgB)_uFJg!EBf#=Ac1} zUrLGy26kR~RYE`N25)5nLK7JsT}xl${@Mi6q@{kY0@A&=w0oF};JV$?$;qkTZ8G@c z>~$dfu&@B`k4>H&d@(M%w%Mgw7W;5@Onm888#Zvao; zb?z_M@-XM~aszL((-&J`aE_b0fe`iYbkfNR%wGllPJea?yy(9v$p@y4dUK%yKeI#> zaY(|X^k^JgZ}`&rj&?&N$T8U^J`qLx=?k35@&aRge_na#B9z+Sut_0)J$10`6vAe?_lRfAwQ&3z3n4&|2N?*CceF)kq|a9uhZ}WA_lxa+zqq+s}~RsFG-K z!WVq>zpM)uKtwQsTg*}8hp5K410Um}UA8kxMyIv+xEf?yee#W^OL~&1Z?EIpX}WnJ z4Y=ee2C5(Ozq`q0Tvt^`D(Q(0!QU?`3iA#gysbbQKhvCLqF^7{V20tXk3>9(eug%v zCfju1^}Ey#{9{lpIbccGBE*Wz3s6NYk2FUmjkdYJxHhEH-*(aYfkd1T-^Wfu)M+Vmm( zD*+AEXFa%=j)@zR*uMU{H{?m4@iAxqY3uP_;4AdAo?eMv`2mzXvmIlIuFDmUG*D`z z%JV8~XlMxRSM2}s@y&&jmO4V6Oo(Ta%@gaLq+G}0nm}4((kM_cJUy_Tm~N!hR@XgW 
zJHZ$|`MtjCD~Y7V6J<^-RxJ8_QS!Pth57sacLz;!8)aVRYjk3C^KQ;kn}7n(ORGM* z(5kbRhj0;ldpj`ddFT*Mh6=iwpM$!_jAzzHb&uQ4Hy|W?zFOCbRjU#Mt zi&gA;G4-wX;75XyuqGzgh~&<(e{FEH!fZkU)cCnco)P06UkaIlD`#BI&OnLTfjLEE z2JS>>87>PW-Srz!8r#m& zaN@gTwa25loEsG%hssq!ONSwo+PYVHCE|*zoOog-6#e#z%+0=GUR{9tAiIAVy8JhL zua!$Epo7J3s`I9QTNf2pD_U~;boI!So2xwoiwZGWRfum`18vWa+Y6_}dYv{_u+Chl z3)GrP{;~4awQoLoq;g8K;(Jik&WwIh(%z!SGnaW+qplC_>AOOy`Mo)^{EKJY02NNm zvVp6jxVrGw|!)Ewy$#cs^Z8`rtW64Wv4==#wAkz5cv4d+*g)b`39R5s>CLNz zC@LC+kPKtf2H9tfu^n11dm@an8}nYbY`@mu@5gt1>)#y59QX6gGtYfr_jR4;d7XEg zzEXcO-=hn9brzcmN;rz@Z=n4=s_3d`vu8ZiZYKVyk$WW*c!jKb#~BB)BxjsJ*x(<@ zGr=+_-6;AvM|dox8-VE?yB2JEZw~L|U)M8mm~!UG@YDUdGR%EPv^_^NA2)= zu|DybJuksI&jS*YiDFiE-kBwuo*fTrb zxjf&F)ATWCm#=LOCd}KJ%|B`{5&^bH)@?<3N6;1i(-TSF&jP@`G{1JdeRi;&Ja8j7 zH$ns(C2nv-pAo=eQ)GC_DOPWInl;%!7FLHyfYBl=5q!HeT)FSffl!Mj=Q5Ocd0<}% z@!J7(J2=Tj)mxtNEDjb*iiNquRMeVl#QwHud<=}PMklWc>ubQ2JKBK?AqbL)Vd+k- z5PRtllHvSlLmRIThB!<#zjd6)ieG`<2v}UW%7Lso4wEZLj9+)5i9gK;n)u&;ye4IP z-cNoJr<@zT0}2bWu6EYfoGi#q$Ooq7UZ-yx?ENrVF~6=H`vrmM?v?XM!qSs|{Uj{| z>HKz4WAaxa&o@u}&*&AK<-$Js&%d@JQC(PE{|9FafYJzlm~sL)1)8o>Hwq>jQ47Sx zmdD>}pIBpbvM4xpLC`RRlQus7{fD2>F z!sg(<)ihvkbC{clB-!CgwVEkgZ~qF|YF+suC? z9CuF(0+l23dY$$`60pW0?|1)NV;TBBU5a~E9r_bW4m~kj`V%tN&4BUs9Wa2oNXJo# z*&w&o9Cs->AB0ZO@Y>gR$Vfs4wZ8yzS^-tUkEKs`Z39J))x&y4ntje1%V06xebAkU zxI)6I-Z|C}YsiDhdIgdZ$a#A7gCx9L4?*W(IGxKz+zUAB7Z*qI=tw-&eh}{x+~<(T zMvWKXyo;SfeDVO^QE37os}t?yIUFd5M#ou$E%d_e72wRPAgDj_uoddjkt%n8s*XRv zEKb=cAkyCD71XV@E#XXo7SG4K;&V@rkX|=(5Jvg@sJ?A`Q&aEN=yBgVw2h*wT9-33 z>M=c-bUDn%k){=yCcdkt*EFcZNVch~QN zH0u<2;L2MXO8Si*nuOo+DLbG>kF`nZmrE1pGS1SO)^woqUls!4Khm5)K&NehNsUc@ z236Dp%Iv#vjKuZVY9B0xr&|TWch)Cv4AAgwsAg+b;R)?K!_1> z^_L%q#f)oRI>xAGHB*v!o|;NzGJ9@4c&~GQUpqK6QYz`|YMBeHM2*h)WKDp+3uo=N zm=1bw$K`b|Zf%~5{ZJSXP?)bj@kxpksj6nRu7Rp~|Dk&FOql;zLjH^H7*A710TAVm zwB)h(z!?n0>EtLCdq&x-XTp^y9E0bKr-BVBrQks5jDLM)?ROPsvASBL;1cZHc~=o< zk!~_PGxaJrS8FucjxB$rsgtH}n2&MmC3UR;3Y09|1~6beCcF+8hpcUS z1}yRrZqLN0M-5xR5{N4kchCPQ5%o(zL4d$$sE8J%p=^g^9lTN#fqGXjfqnCTpbu=S zlhE*_TGkI4X6?M%d}_DRBwS=L?o)`k+ilM55T}|PqL(xnFxh#J;vj2s7tC<#BuNu4 z&`D(i`ePFzMPL;ad}6@}r5rSIY|UO*hKQ|C@w9_ND>&|-ylgSl3u4gKY59za0~q0- zDxmcDSN)jEI?joLGa`?lf43QRm)NuuTX+!SiSfl)8PQ4XwjqC~P}K77EcY<8pJ~?k z2abfa=wM7PY)t&wWO^yE2&MCMeM({s7z&@mdo?7qd4(iZUsvbgShKnyv>i7usoupd zGt&Qah`KthaF%SxxHP8QAF2NVmqgST&hL`Q)Tz9~J^Q;U2-MYd2l204Dgn~$wcF$5 z+2y9~OM8jySGFOmJKW_hJKYq=nduhIJZ0b(aWghrs5$S}6j*vf#dsm|$?Bb53A47U zXG13^3Ij}>Zuj9E?o~y0&|3o2*9iW_|J_6U z)5ig=m0V2mk!NOp+o55y#6H^(&wj-ppooQVvIho{bueK2zJ+JCjRimyrRN*^9S zM(oV4Q7)J`>Gq`(T%#S@p`S>J@dZRvlhf+hs_3jCbMzh%q^`qk{rq*@_D`ulqFwxwL&5ZB{s(W3w0Y*b#a z$uZv#_~Z&qPxMb<5kKJG+1 zXxk!xDY>x-(P!CGg3o~@Z&N4|BX!yFE;^5K1^u7X{{dcF_*sr@@^e+O)&dBT<=zFO5~|$sksWG2a{XVn zT>I(IR8bPWK6_Q@S#oSY97H|0sfI>2rJi(qE+bnT0mm;+PA9XGwGnFG9LmDq;Bgmd z?9lKMuu>bXZy^BUT9E^%$aI`=*Q4@&zpJPv$G(_H^&@G$=+3?1%y+!DRPnI1?77W0 z*0k15CY3w+Y}*h&(%jsfUkdwM+R=S+ky7{evI#R@R8=ilG?daAP^V;XP$^8`=*Q`H zoANstmVsv9H~TusV|oPs4aHT>MGEzyDQ`Knb`rNdn5`{@PM@Jj#&B=pY=2>XDLA6% z{p4Ax%J>8gu|AmeIu)~E0|7AtRoukJfiYiU1@0R1{Q!{RVs+(b}BI_X|k*! 
zGJ78-`d0}+=;ATCYMDDan_beFq%&N}gUvu3yuB1-r|H%zhYC8nK1C~>5x74G3Fp+` z){CEL1KuGX6F>$khT;G&GRS@`KfO`s#}Q=#IVK;DNR?}I%kBI@`Fca`QK|l?n6SG3 z@qs3)FQc)sr|r^#ic74#_6Xo}4YHJ+NsrPPCgqAw(#e*oBRnpY^Yw5NUMG#+OfTrV zC@7|GJXT^!e4|V$h^HKPfN>ZyFkZPyg_P12Ifg!bXYHw*^C-~bhz1) zBCTsIz9tVuAO}-R4<*(tWF=Vs?`$u2ZWdKNc?F~P!iwdk`8Xu%f-j&C?^<==>nlw$ zMG7abBKCati!vaJ9#tCji5gF-NCM6g@{5yhS$}13hx6h_5k^yK)cj6SN4jh4VD7cd zSeK&dH&U}IE_PaGl~pNkTXjhl@?8#-C*81}5!CTsLg-oi8D3QkcQ`trRbd297bt@s z=bM{Z`1~kGkyDUmjF+Dln$8IGOF4Z}?;Pw77556pp4@TV1#5;TvZmTp#yGV=*=a8- zm%WB=@*bC1qVK?-$F=39QUb7fpCB@4aZqnrpKfvU@~@QT4pv| zd=^&o%=$}X>=^m`RK_NSSNFr(ru(a!#{vDdonDu^)dk=GgSs{xuAm0uSblKAIn?kfuz~e+&q=$8XWEP zrGWhSS1i4^UPHUxcSz4&uLkOD%Bj`Nt=-zuUI1S(8x#lI^1R=V5NlfpZNX`IR#e44 zFz1JUNb$_m8SR`|Z}+YN&~bF9wvoXd>c;GL%Yu&mKQ*55FW6w?eE~x3$=5a?2xh*_ zzTaLrl5JCWIHZF|BJm~kS-n?CbBSjl3{3kO5dV&h^K;q7*HRqxab$I|tgpsubFUys zds8DU3ktysWiMKt|2f=e9=KzUivV5E9aQjxS=F4`%sfuHY3*^q2TtDAE$jaPouicK zo1GH&3x49#9RQD$c!ROR^LZK?lw`!Mqo$EFgjd4d*CsfBc8i_AJ{ONEr12o+SLaza zAEtznOS&VHiAd11ae})#xWGNesKGyW5$j$iI9PU{MSLIziIe=VpO*gHLT}BhpFx;a zZd|*9I9ePmbPm*(w8<^;K?^HyC9u$O6P{3EtAWnn-5I1#7eVUOeeF8mN+e_9T7gSy z>!n9t_Ng1>1P`Qs7MoLKEtIAUgSnO~E&lHI|2+6VZ+yjU{r@@Di#RzBrRrQ8Bz vj}d5SkuBD|&F&P$|7Dk&zvlnK&7_ZTJzKx6I8_QBjBCH4>E7qN?Jxfq;c;}s literal 0 HcmV?d00001 diff --git a/docs/source/dev/kernel/paged_attention.rst b/docs/source/dev/kernel/paged_attention.rst new file mode 100644 index 0000000000000..6fcadeeec27b6 --- /dev/null +++ b/docs/source/dev/kernel/paged_attention.rst @@ -0,0 +1,525 @@ +vLLM Paged Attention +==================== + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (``csrc/attention/attention_kernels.cu``). + This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block. So in a later document, I will refer to vLLM paged + attention block as "block", while refer to GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and feel easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + multiplication implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. + +Inputs +------ + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers ``q``, ``k_cache``, and ``v_cache``, which point + to query, key, and value data on global memory that need to be read + and processed. The output pointer ``out`` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + .. 

  .. code:: cpp

     template<
       typename scalar_t,
       int HEAD_SIZE,
       int BLOCK_SIZE,
       int NUM_THREADS,
       int PARTITION_SIZE = 0>
     __device__ void paged_attention_kernel(
       ... // Other side args.
       const scalar_t* __restrict__ out,     // [num_seqs, num_heads, max_num_partitions, head_size]
       const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
       const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
       const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
       ... // Other side args.
     )

- There is also a list of template arguments above the function
  signature that are determined at compilation time. ``scalar_t``
  represents the data type of the query, key, and value data elements,
  such as FP16. ``HEAD_SIZE`` indicates the number of elements in each
  head. ``BLOCK_SIZE`` refers to the number of tokens in each block.
  ``NUM_THREADS`` denotes the number of threads in each thread block.
  ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (for
  simplicity, we assume this is 0 and tensor parallel is disabled).
- With these arguments, we need to perform a sequence of preparations.
  This includes calculating the current head index, block index, and
  other necessary variables. However, for now, we can ignore these
  preparations and proceed directly to the actual calculations. It will
  be easier to understand them once we grasp the entire flow.

Concepts
--------

- Just before we dive into the calculation flow, I want to describe a
  few concepts that are needed for later sections. However, you may
  skip this section and return later if you encounter any confusing
  terminology.
- **Sequence**: A sequence represents a client request. For example,
  the data pointed to by ``q`` has a shape of
  ``[num_seqs, num_heads, head_size]``. That means ``q`` points to the
  query data of a total of ``num_seqs`` sequences. Since this kernel is
  a single query attention kernel, each sequence only has one query
  token. Hence, ``num_seqs`` equals the total number of tokens that are
  processed in the batch.
- **Context**: The context consists of the generated tokens from the
  sequence. For instance, ``["What", "is", "your"]`` are the context
  tokens, and the input query token is ``"name"``. The model might
  generate the token ``"?"``.
- **Vec**: The vec is a list of elements that are fetched and
  calculated together. For query and key data, the vec size
  (``VEC_SIZE``) is determined so that each thread group can fetch and
  calculate 16 bytes of data at a time. For value data, the vec size
  (``V_VEC_SIZE``) is determined so that each thread can fetch and
  calculate 16 bytes of data at a time. For example, if the
  ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the
  ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8 (see the
  sketch at the end of this list).
- **Thread group**: The thread group is a small group of
  threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one
  query token and one key token at a time. Each thread handles only a
  portion of the token data. The total number of elements processed by
  one thread group is referred to as ``x``. For example, if the thread
  group contains 2 threads and the head size is 8, then thread 0
  handles the query and key elements at index 0, 2, 4, 6, while thread
  1 handles the elements at index 1, 3, 5, 7.
- **Block**: The key and value cache data in vLLM are split into
  blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``)
  of tokens at one head. Each block may contain only a portion of the
  whole context tokens. For example, if the block size is 16 and the
  head size is 128, then for one head, one block can store 16 \* 128 =
  2048 elements.
- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that
  execute simultaneously on a stream multiprocessor (SM). In this
  kernel, each warp processes the calculation between one query token
  and key tokens of one entire block at a time (it may process multiple
  blocks in multiple iterations). For example, if there are 4 warps and
  6 blocks for one context, the assignment would be: warp 0 handles the
  0th and 4th blocks, warp 1 handles the 1st and 5th blocks, warp 2
  handles the 2nd block, and warp 3 handles the 3rd block.
- **Thread block**: A thread block is a group of
  threads(\ ``NUM_THREADS``) that can access the same shared memory.
  Each thread block contains multiple warps(\ ``NUM_WARPS``), and in
  this kernel, each thread block processes the calculation between one
  query token and key tokens of a whole context.
- **Grid**: A grid is a collection of thread blocks and defines the
  shape of the collection. In this kernel, the shape is
  ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread
  block only handles the calculation for one head, one sequence, and
  one partition.
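
- To make the "Vec" and "Thread group" sizes concrete, here is a
  minimal sketch of how these constants could be derived from the
  template arguments, assuming the template arguments from the Inputs
  section are in scope. The exact expressions in
  ``csrc/attention/attention_kernels.cu`` may differ; this only
  illustrates the "16 bytes per fetch" rule described above.

  .. code:: cpp

     // Sketch only (assumed formulas, not copied from the kernel source).
     constexpr int WARP_SIZE = 32;
     // A warp covers one block of key tokens, so each key token is shared
     // by WARP_SIZE / BLOCK_SIZE cooperating threads (one thread group).
     constexpr int THREAD_GROUP_SIZE = WARP_SIZE / BLOCK_SIZE;               // e.g. 32 / 16 = 2
     // Each thread group fetches 16 bytes of query/key data per access.
     constexpr int VEC_SIZE = 16 / (THREAD_GROUP_SIZE * sizeof(scalar_t));   // FP16: 16 / (2 * 2) = 4
     // Number of vecs one thread owns for one token at one head.
     constexpr int NUM_VECS_PER_THREAD = HEAD_SIZE / (THREAD_GROUP_SIZE * VEC_SIZE); // 128 / 8 = 16
     // Each single thread fetches 16 bytes of value data per access.
     constexpr int V_VEC_SIZE = 16 / sizeof(scalar_t);                       // FP16: 16 / 2 = 8

  With FP16 and ``THREAD_GROUP_SIZE`` of 2, these expressions reproduce
  the ``VEC_SIZE`` of 4 and ``V_VEC_SIZE`` of 8 from the "Vec" bullet.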

Query
-----

- This section will introduce how query data is stored in memory and
  fetched by each thread. As mentioned above, each thread group fetches
  the data of one query token, while each thread itself only handles a
  part of that query token's data. Within each warp, every thread group
  will fetch the same query token data, but will multiply it with
  different key token data.

  .. code:: cpp

     const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;

  .. figure:: ../../assets/kernel/query.png
     :alt: query
     :width: 70%
     :align: center

     Query data of one token at one head

- Each thread defines its own ``q_ptr``, which points to the assigned
  query token data in global memory. For example, if ``VEC_SIZE`` is 4
  and ``HEAD_SIZE`` is 128, ``q_ptr`` points to data that contains a
  total of 128 elements divided into 128 / 4 = 32 vecs.

  .. figure:: ../../assets/kernel/q_vecs.png
     :alt: q_vecs
     :width: 70%
     :align: center

     ``q_vecs`` for one thread group

  .. code:: cpp

     __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];

- Next, we need to read the global memory data pointed to by ``q_ptr``
  into shared memory as ``q_vecs``. It is important to note that each
  vec is assigned to a different row. For example, if the
  ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs,
  while thread 1 handles the 1st row vecs. By reading the query data in
  this way, neighboring threads like thread 0 and thread 1 read
  neighboring memory, achieving memory coalescing and improving
  performance (see the sketch below).
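
- As a rough sketch (not the exact code from
  ``attention_kernels.cu``), the read described above can be pictured
  as each thread copying every ``THREAD_GROUP_SIZE``-th vec of the
  query token into its own row of ``q_vecs``:

  .. code:: cpp

     // Sketch only: thread "thread_group_offset" of a thread group loads
     // its share of the query token from global memory into shared memory.
     #pragma unroll
     for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
       // Interleave vec indices across the threads of the group so that
       // neighboring threads read neighboring 16-byte chunks (coalescing).
       const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
       q_vecs[thread_group_offset][i] =
           *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
     }

  With ``THREAD_GROUP_SIZE`` of 2, thread 0 (offset 0) loads vecs 0, 2,
  4, ... into row 0 and thread 1 (offset 1) loads vecs 1, 3, 5, ... into
  row 1, which matches the row assignment described above.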

Key
---

- Similar to the "Query" section, this section introduces the memory
  layout and assignment for keys. While each thread group only handles
  one query token per kernel run, it may handle multiple key tokens
  across multiple iterations. Meanwhile, each warp will process
  multiple blocks of key tokens in multiple iterations, ensuring that
  all context tokens are processed by the entire thread group after the
  kernel run. In this context, "handle" refers to performing the dot
  multiplication between query data and key data.

  .. code:: cpp

     const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
                           + kv_head_idx * kv_head_stride
                           + physical_block_offset * x;

- Unlike ``q_ptr``, ``k_ptr`` in each thread will point to a different
  key token at different iterations. As shown above, ``k_ptr`` points
  to key token data based on ``k_cache`` at the assigned block,
  assigned head and assigned token.

  .. figure:: ../../assets/kernel/key.png
     :alt: key
     :width: 70%
     :align: center

     Key data of all context tokens at one head

- The diagram above illustrates the memory layout for key data. It
  assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is
  8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each
  rectangle represents all the elements for one key token at one head,
  which will be processed by one thread group. The left half shows the
  total 16 blocks of key token data for warp 0, while the right half
  represents the remaining key token data for other warps or
  iterations. Inside each rectangle, there are a total of 32 vecs (128
  elements for one token) that will be processed by 2 threads (one
  thread group) separately.

  .. figure:: ../../assets/kernel/k_vecs.png
     :alt: k_vecs
     :width: 70%
     :align: center

     ``k_vecs`` for one thread

  .. code:: cpp

     K_vec k_vecs[NUM_VECS_PER_THREAD]

- Next, we need to read the key token data from ``k_ptr`` and store it
  in register memory as ``k_vecs``. We use register memory for
  ``k_vecs`` because it will only be accessed by one thread once,
  whereas ``q_vecs`` will be accessed by multiple threads multiple
  times. Each ``k_vecs`` will contain multiple vectors for later
  calculation. Each vec will be set at each inner iteration. The
  assignment of vecs allows neighboring threads in a warp to read
  neighboring memory together, which again promotes memory coalescing.
  For instance, thread 0 will read vec 0, while thread 1 will read
  vec 1. In the next inner loop, thread 0 will read vec 2, while thread
  1 will read vec 3, and so on.
- You may still be a little confused about the overall flow. Don't
  worry, please keep reading the next "QK" section. It will illustrate
  the query and key calculation flow in a clearer and higher-level
  manner.

QK
---

- As shown in the pseudo code below, before the entire for loop block,
  we fetch the query data for one token and store it in ``q_vecs``.
  Then, in the outer for loop, we iterate through different ``k_ptrs``
  that point to different tokens and prepare the ``k_vecs`` in the
  inner for loop. Finally, we perform the dot multiplication between
  the ``q_vecs`` and each ``k_vecs``.

  .. code:: cpp

     q_vecs = ...
     for ... {
        k_ptr = ...
        for ... {
           k_vecs[i] = ...
        }
        ...
        float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
     }

- As mentioned before, each thread only fetches part of the query and
  key token data at a time. However, there is a cross thread group
  reduction inside ``Qk_dot<>::dot``, so the ``qk`` returned here is
  not just the partial dot product between parts of the query and key
  tokens, but actually the full result over the entire query and key
  token data (a standalone sketch of such a reduction is shown at the
  end of this section).
- For example, if the value of ``HEAD_SIZE`` is 128 and
  ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain a
  total of 64 elements. However, the returned ``qk`` is actually the
  result of the dot multiplication between 128 query elements and 128
  key elements. If you want to learn more about the details of the dot
  multiplication and reduction, you may refer to the implementation of
  ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
  cover it in this document.
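
- To give a feel for what such a cross thread group reduction does,
  here is a minimal illustration written with raw CUDA warp shuffles.
  It is not vLLM's ``Qk_dot<>::dot`` implementation; it only shows how
  the partial dot products of the threads in one thread group (assumed
  to be a power-of-two subset of one warp) can be combined so that
  every thread ends up holding the full ``qk``.

  .. code:: cpp

     // Illustration only: combine per-thread partial dot products within
     // one thread group of THREAD_GROUP_SIZE threads.
     __device__ float thread_group_dot_reduce(float partial_qk) {
     #pragma unroll
       for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
         // Exchange partial sums with the thread whose lane id differs by
         // `mask`; after the loop every thread holds the full dot product.
         partial_qk += __shfl_xor_sync(0xffffffffu, partial_qk, mask);
       }
       return partial_qk;
     }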
+  If you want to learn more about the details of the dot multiplication
+  and reduction, you may refer to the implementation of
+  ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
+  cover it in this document.
+
+Softmax
+-------
+
+- Next, we need to calculate the normalized softmax over all ``qk``\ s,
+  as shown in the following formulas, where each :math:`x` represents a
+  ``qk``. To do this, we must obtain the reduced value of
+  ``qk_max``\ (:math:`m(x)`) and the ``exp_sum``\ (:math:`\ell(x)`) of
+  all ``qk``\ s. The reduction should be performed across the entire
+  thread block, encompassing results between the query token and all
+  context key tokens.
+
+  .. math::
+     :nowrap:
+
+     \begin{gather*}
+     m(x):=\max _i \quad x_i \\
+     \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\
+     \quad \ell(x):=\sum_i f(x)_i \\
+     \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
+     \end{gather*}
+
+``qk_max`` and ``logits``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Right after we get the ``qk`` result, we can set the temporary
+  ``logits`` result to ``qk`` (in the end, ``logits`` should store the
+  normalized softmax result). We can also compare and collect the
+  ``qk_max`` over all ``qk``\ s that are calculated by the current
+  thread group.
+
+  .. code:: cpp
+
+     if (thread_group_offset == 0) {
+        const bool mask = token_idx >= context_len;
+        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+     }
+
+- Please note that ``logits`` here is in shared memory, so each thread
+  group will set the fields for its own assigned context tokens.
+  Overall, the size of ``logits`` should be the number of context
+  tokens.
+
+  .. code:: cpp
+
+     for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+        qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+     }
+
+     if (lane == 0) {
+        red_smem[warp_idx] = qk_max;
+     }
+
+- Then we need to get the reduced ``qk_max`` within each warp. The main
+  idea is to let the threads in a warp communicate with each other and
+  obtain the final max ``qk``.
+
+  .. code:: cpp
+
+     for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+        qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+     }
+     qk_max = VLLM_SHFL_SYNC(qk_max, 0);
+
+- Finally, we can get the reduced ``qk_max`` of the whole thread block
+  by comparing the ``qk_max`` values from all warps in this thread
+  block, and then broadcast the final result to each thread.
+
+``exp_sum``
+~~~~~~~~~~~
+
+- Similar to ``qk_max``, we need to get the reduced sum value from the
+  entire thread block too.
+
+  .. code:: cpp
+
+     for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        float val = __expf(logits[i] - qk_max);
+        logits[i] = val;
+        exp_sum += val;
+     }
+     ...
+     exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum);
+
+- First, sum all the exp values in each thread, and meanwhile convert
+  each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. Please
+  note that ``qk_max`` here is already the max ``qk`` across the whole
+  thread block. Then we can do the reduction for ``exp_sum`` across the
+  whole thread block, just like for ``qk_max``.
+
+  .. code:: cpp
+
+     const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+     for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        logits[i] *= inv_sum;
+     }
+
+- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain
+  the final normalized softmax result as ``logits``. This ``logits``
+  variable will be used for the dot multiplication with the value data
+  in later steps. At this point, it stores the normalized softmax
+  result of ``qk`` for all assigned context tokens.
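+- The same max-then-exponentiate-then-normalize sequence can be written
+  as a short serial reference. The sketch below is plain C++ for
+  illustration only; the kernel instead performs the max and sum as the
+  parallel reductions shown above, and the epsilon mirrors the one used
+  in the snippet.
+
+  .. code:: cpp
+
+     #include <cmath>
+     #include <cstdio>
+     #include <vector>
+
+     // Normalizes `logits` in place, mirroring the qk_max / exp_sum steps.
+     void softmax_inplace(std::vector<float>& logits) {
+       float qk_max = -INFINITY;
+       for (float v : logits) qk_max = std::fmax(qk_max, v);  // block-wide max
+       float exp_sum = 0.f;
+       for (float& v : logits) {                              // exp and sum
+         v = std::exp(v - qk_max);
+         exp_sum += v;
+       }
+       const float inv_sum = 1.f / (exp_sum + 1e-6f);         // same epsilon
+       for (float& v : logits) v *= inv_sum;                  // normalize
+     }
+
+     int main() {
+       std::vector<float> logits = {1.0f, 2.0f, 3.0f, 4.0f};
+       softmax_inplace(logits);
+       for (float v : logits) std::printf("%f ", v);
+       std::printf("\n");
+       return 0;
+     }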
+
+Value
+-----
+
+.. figure:: ../../assets/kernel/value.png
+   :alt: value
+   :width: 70%
+   :align: center
+
+   Value data of all context tokens at one head
+
+.. figure:: ../../assets/kernel/logits_vec.png
+   :alt: logits_vec
+   :width: 50%
+   :align: center
+
+   ``logits_vec`` for one thread
+
+.. figure:: ../../assets/kernel/v_vec.png
+   :alt: v_vec
+   :width: 70%
+   :align: center
+
+   List of ``v_vec`` for one thread
+
+- Now we need to retrieve the value data and perform the dot
+  multiplication with ``logits``. Unlike query and key, there is no
+  thread group concept for value data. As shown in the diagram, unlike
+  the key token memory layout, elements in the same column correspond
+  to the same value token. For one block of value data, there are
+  ``HEAD_SIZE`` rows and ``BLOCK_SIZE`` columns, which are split into
+  multiple ``v_vecs``.
+- Each thread always fetches ``V_VEC_SIZE`` elements from the same
+  ``V_VEC_SIZE`` tokens at a time. As a result, a single thread
+  retrieves multiple ``v_vec``\ s from different rows and the same
+  columns through multiple inner iterations. Each ``v_vec`` needs to be
+  dot multiplied with the corresponding ``logits_vec``, which is also
+  ``V_VEC_SIZE`` elements from ``logits``. Overall, with multiple inner
+  iterations, each warp will process one block of value tokens, and
+  with multiple outer iterations, the value tokens of the whole context
+  are processed.
+
+  .. code:: cpp
+
+     float accs[NUM_ROWS_PER_THREAD];
+     for ... { // Iteration over different blocks.
+         logits_vec = ...
+         for ... { // Iteration over different rows.
+             v_vec = ...
+             ...
+             accs[i] += dot(logits_vec, v_vec);
+         }
+     }
+
+- As shown in the above pseudo code, in the outer loop, similar to
+  ``k_ptr``, ``logits_vec`` iterates over different blocks and reads
+  ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each
+  thread reads ``V_VEC_SIZE`` elements from the same tokens as a
+  ``v_vec`` and performs the dot multiplication. It is important to
+  note that in each inner iteration, the thread fetches elements at
+  different head positions for the same tokens. The dot result is then
+  accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped
+  to a head position assigned to the current thread.
+- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each
+  thread fetches 8 value elements for 8 tokens at a time. Each element
+  comes from a different token at the same head position. If
+  ``HEAD_SIZE`` is 128 and ``WARP_SIZE`` is 32, for each inner loop, a
+  warp needs to fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This
+  means there are a total of 128 \* 16 / 256 = 8 inner iterations for a
+  warp to handle a whole block of value tokens. The ``accs`` array in
+  each thread then contains 8 elements accumulated at 8 different head
+  positions. For thread 0, the ``accs`` variable will have 8 elements,
+  which are the 0th, 16th, …, 112th elements of a value head,
+  accumulated from all 8 assigned tokens.
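+- The following host-side sketch (again, an illustration rather than
+  kernel code; the sizes and values are toy examples) shows the net
+  effect of this step for a single head: each head position of the
+  output is the ``logits``-weighted sum of that position over all value
+  tokens, which is exactly what the per-thread ``accs`` entries
+  accumulate.
+
+  .. code:: cpp
+
+     #include <cstdio>
+     #include <vector>
+
+     int main() {
+       constexpr int HEAD_SIZE = 8;    // small example values
+       constexpr int NUM_TOKENS = 4;
+
+       // value[token][head_position], plus already-normalized logits.
+       std::vector<std::vector<float>> value(NUM_TOKENS,
+                                             std::vector<float>(HEAD_SIZE));
+       std::vector<float> logits = {0.1f, 0.2f, 0.3f, 0.4f};
+       for (int t = 0; t < NUM_TOKENS; ++t)
+         for (int h = 0; h < HEAD_SIZE; ++h)
+           value[t][h] = 0.1f * t + 0.01f * h;
+
+       // accs[h] accumulates the weighted value elements for head position h,
+       // as the kernel does across v_vecs and blocks.
+       std::vector<float> accs(HEAD_SIZE, 0.f);
+       for (int t = 0; t < NUM_TOKENS; ++t)
+         for (int h = 0; h < HEAD_SIZE; ++h)
+           accs[h] += logits[t] * value[t][h];
+
+       for (int h = 0; h < HEAD_SIZE; ++h)
+         std::printf("out[%d] = %f\n", h, accs[h]);
+       return 0;
+     }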
+
+LV
+---
+
+- Now, we need to perform a reduction on ``accs`` within each warp.
+  This process allows each thread to accumulate the ``accs`` for the
+  assigned head positions of all tokens in one block.
+
+  .. code:: cpp
+
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        float acc = accs[i];
+        for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+            acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+        }
+        accs[i] = acc;
+     }
+
+- Next, we perform a reduction on ``accs`` across all warps, allowing
+  each thread to have the accumulation of ``accs`` for the assigned
+  head positions of all context tokens. Please note that each ``accs``
+  in every thread only stores the accumulation for a portion of the
+  elements of the entire head for all context tokens. However, overall,
+  all results for the output have been calculated; they are just stored
+  in different threads' register memory.
+
+  .. code:: cpp
+
+     float* out_smem = reinterpret_cast<float*>(shared_mem);
+     for (int i = NUM_WARPS; i > 1; i /= 2) {
+         // Upper warps write to shared memory.
+         ...
+         float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+         for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+             ...
+             dst[row_idx] = accs[i];
+         }
+
+         // Lower warps update the output.
+         const float* src = &out_smem[warp_idx * HEAD_SIZE];
+         for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+             ...
+             accs[i] += src[row_idx];
+         }
+
+         // Write out the accs.
+     }
+
+Output
+------
+
+- Now we can write all of the calculated results from local register
+  memory to the final output in global memory.
+
+  .. code:: cpp
+
+     scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                         + head_idx * max_num_partitions * HEAD_SIZE
+                         + partition_idx * HEAD_SIZE;
+
+- First, we need to define the ``out_ptr`` variable, which points to
+  the start address of the assigned sequence and assigned head.
+
+  .. code:: cpp
+
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+            from_float(*(out_ptr + row_idx), accs[i]);
+        }
+     }
+
+- Finally, we need to iterate over the different assigned head
+  positions and write out the corresponding accumulated results based
+  on ``out_ptr``.
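+- Putting the steps together, a minimal serial reference of the whole
+  single-query attention computation for one head might look like the
+  sketch below. It is an editorial illustration of the math only (the
+  function name and toy inputs are invented for this example) and
+  ignores paging, partitions, multiple heads, and all of the parallel
+  reductions described above.
+
+  .. code:: cpp
+
+     #include <cmath>
+     #include <cstdio>
+     #include <vector>
+
+     // Reference single-query attention for one head (no paging, no partitions).
+     std::vector<float> single_query_attention(
+         const std::vector<float>& q,
+         const std::vector<std::vector<float>>& keys,
+         const std::vector<std::vector<float>>& values,
+         float scale) {
+       const int num_tokens = static_cast<int>(keys.size());
+       const int head_size = static_cast<int>(q.size());
+
+       // QK: scaled dot product between the query and every context key token.
+       std::vector<float> logits(num_tokens, 0.f);
+       for (int t = 0; t < num_tokens; ++t) {
+         for (int h = 0; h < head_size; ++h) logits[t] += q[h] * keys[t][h];
+         logits[t] *= scale;
+       }
+
+       // Softmax using the qk_max / exp_sum steps described above.
+       float qk_max = -INFINITY, exp_sum = 0.f;
+       for (float v : logits) qk_max = std::fmax(qk_max, v);
+       for (float& v : logits) { v = std::exp(v - qk_max); exp_sum += v; }
+       for (float& v : logits) v /= exp_sum;
+
+       // LV + Output: logits-weighted sum of the value tokens.
+       std::vector<float> out(head_size, 0.f);
+       for (int t = 0; t < num_tokens; ++t)
+         for (int h = 0; h < head_size; ++h) out[h] += logits[t] * values[t][h];
+       return out;
+     }
+
+     int main() {
+       std::vector<float> q = {0.1f, 0.2f, 0.3f, 0.4f};
+       std::vector<std::vector<float>> keys = {{1.f, 0.f, 0.f, 0.f},
+                                               {0.f, 1.f, 0.f, 0.f}};
+       std::vector<std::vector<float>> values = {{1.f, 2.f, 3.f, 4.f},
+                                                 {5.f, 6.f, 7.f, 8.f}};
+       auto out = single_query_attention(q, keys, values, 1.0f);
+       for (float v : out) std::printf("%f ", v);
+       std::printf("\n");
+       return 0;
+     }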
diff --git a/docs/source/index.rst b/docs/source/index.rst index e90481845c4ff..c0250bf99f7ae 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -98,6 +98,7 @@ Documentation :caption: Developer Documentation dev/engine/engine_index + dev/kernel/paged_attention Indices and tables ================== From 9cbc7e5f3be72552d6041f81738921a9597643e8 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Tue, 5 Mar 2024 02:37:58 +0800 Subject: [PATCH 051/113] enable --gpu-memory-utilization in benchmark_throughput.py (#3175) Co-authored-by: zixiao --- benchmarks/benchmark_throughput.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1f0bfe06a67cb..72bdc4b3b4540 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -74,6 +74,7 @@ def run_vllm( kv_cache_dtype: str, device: str, enable_prefix_caching: bool, + gpu_memory_utilization: float = 0.9, ) -> float: from vllm import LLM, SamplingParams llm = LLM(model=model, @@ -84,6 +85,7 @@ def run_vllm( trust_remote_code=trust_remote_code, dtype=dtype, max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, @@ -206,13 +208,12 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device, - args.enable_prefix_caching) + elapsed_time = run_vllm( + requests, args.model, args.tokenizer, args.quantization, + args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, + args.trust_remote_code, args.dtype, args.max_model_len, + args.enforce_eager, args.kv_cache_dtype, args.device, + args.enable_prefix_caching, args.gpu_memory_utilization) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -287,6 +288,12 @@ def main(args: argparse.Namespace): 'The "auto" option will use FP16 precision ' 'for FP32 and FP16 models, and BF16 precision ' 'for BF16 models.') + parser.add_argument('--gpu-memory-utilization', + type=float, + default=0.9, + help='the fraction of GPU memory to be used for ' + 'the model executor, which can range from 0 to 1.' 
+ 'If unspecified, will use the default value of 0.9.') parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution") From 76e8a70476ef9daa970349c14c117fe91e8b4544 Mon Sep 17 00:00:00 2001 From: ttbachyinsda Date: Tue, 5 Mar 2024 03:17:12 +0800 Subject: [PATCH 052/113] [Minor fix] The domain dns.google may cause a socket.gaierror exception (#3176) Co-authored-by: guofangze --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index a4f9bfe6aac99..9cdf623379516 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -173,7 +173,7 @@ def get_ip() -> str: # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: - s.connect(("dns.google", 80)) # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable return s.getsockname()[0] except OSError: # try ipv6 From 22de45235c6dd14e901e089971635ec655d5fbe0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 4 Mar 2024 11:54:06 -0800 Subject: [PATCH 053/113] Push logprob generation to LLMEngine (#3065) Co-authored-by: Avnish Narayan --- tests/entrypoints/test_openai_server.py | 61 ++- tests/samplers/test_logprobs.py | 42 +- tests/worker/spec_decode/utils.py | 12 +- vllm/config.py | 2 + vllm/engine/arg_utils.py | 10 +- vllm/engine/async_llm_engine.py | 29 +- vllm/engine/llm_engine.py | 42 +- vllm/entrypoints/openai/serving_chat.py | 236 ++++++----- vllm/entrypoints/openai/serving_completion.py | 391 +++++++++--------- vllm/entrypoints/openai/serving_engine.py | 23 +- vllm/model_executor/layers/sampler.py | 15 +- vllm/sequence.py | 25 +- vllm/worker/spec_decode/multi_step_worker.py | 2 +- 13 files changed, 555 insertions(+), 335 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e426cf7eed72b..f4a6e44d88a87 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -213,14 +213,14 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, messages=messages, max_tokens=10, logprobs=True, - top_logprobs=10) + top_logprobs=5) assert chat_completion.id is not None assert chat_completion.choices is not None and len( chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10 + assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -229,7 +229,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, # test multi-turn dialogue messages.append({"role": "user", "content": "express your result in json"}) chat_completion = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=messages, max_tokens=10, ) @@ -237,6 +237,61 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" 
+ }] + + # Default max_logprobs is 5, so this should raise an error + with pytest.raises((openai.BadRequestError, openai.APIError)): + stream = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10, + stream=True) + async for chunk in stream: + ... + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10, + stream=False) + + with pytest.raises((openai.BadRequestError, openai.APIError)): + stream = await client.completions.create(model=model_name, + prompt="Test", + max_tokens=10, + logprobs=10, + stream=True) + async for chunk in stream: + ... + + with pytest.raises(openai.BadRequestError): + await client.completions.create(model=model_name, + prompt="Test", + max_tokens=10, + logprobs=10, + stream=False) + + # the server should still work afterwards + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + stream=False) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 0ea3704462fcb..1abb55f021214 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,5 +1,6 @@ import pytest import torch +from tests.conftest import VllmRunner from vllm import SamplingParams @@ -16,6 +17,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 + num_top_logprobs = 6 hf_model = hf_runner(model, dtype=dtype) hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, @@ -23,19 +25,32 @@ def test_get_prompt_logprobs( ) del hf_model - vllm_model = vllm_runner(model, dtype=dtype) + vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=5, + logprobs=num_top_logprobs, prompt_logprobs=5, temperature=0.0) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) - del vllm_model # Test whether logprobs are included in the results. 
for result in vllm_results: assert result.prompt_logprobs is not None assert result.outputs[0].logprobs is not None + assert len(result.outputs[0].logprobs) == max_tokens + for logprobs in result.outputs[0].logprobs: + assert len(logprobs) == num_top_logprobs + output_text = result.outputs[0].text + output_string_from_most_likely_tokens = [] + for top_logprobs in result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens.append( + top_logprob.decoded_token) + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens) + assert output_text == output_string_from_most_likely_tokens, ( + "The output text from the top logprob for each token position " + "should be the same as the output text in the result.") # Test whether prompt logprobs are consistent with HF for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): @@ -43,14 +58,29 @@ def test_get_prompt_logprobs( vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob, + torch.testing.assert_close(logprob.logprob, hf_logprob[0][i][token_id].item(), atol=1e-2, rtol=1e-2) vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): - for token_id, logprob in vllm_sample_logprob_dict.items(): + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + logprob = sample_logprob.logprob torch.testing.assert_close(logprob, hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) + assert isinstance(sample_logprob.decoded_token, str), \ + ("The token should be decoded by the time it is returned " + " to the user.") + + +def test_max_logprobs(): + runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) diff --git a/tests/worker/spec_decode/utils.py b/tests/worker/spec_decode/utils.py index 8d74509fea488..fa8767cf898aa 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/worker/spec_decode/utils.py @@ -4,7 +4,7 @@ from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import SequenceGroupMetadata, SequenceData +from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed @@ -166,13 +166,15 @@ def create_seq_group_metadata_from_prompts( def assert_logprobs_dict_allclose( - actual_logprobs: List[Dict[int, float]], - expected_logprobs: List[Dict[int, float]]) -> None: + actual_logprobs: List[Dict[int, Logprob]], + expected_logprobs: List[Dict[int, Logprob]]) -> None: for single_step_actual_logprobs, single_step_expected_logprobs in zip( actual_logprobs, expected_logprobs): assert set(single_step_actual_logprobs.keys()) == set( single_step_expected_logprobs.keys()) for token_id in single_step_actual_logprobs: - actual = torch.tensor(single_step_actual_logprobs[token_id]) - expected = 
torch.tensor(single_step_expected_logprobs[token_id]) + actual = torch.tensor( + single_step_actual_logprobs[token_id].logprob) + expected = torch.tensor( + single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) diff --git a/vllm/config.py b/vllm/config.py index e39fd7265689f..ef9a920f29c2a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -79,6 +79,7 @@ def __init__( quantization: Optional[str] = None, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, + max_logprobs: int = 5, ) -> None: self.model = model self.tokenizer = tokenizer @@ -93,6 +94,7 @@ def __init__( self.quantization = quantization self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture + self.max_logprobs = max_logprobs if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6882e8be34d11..c3dccdd5bb50b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -31,6 +31,7 @@ class EngineArgs: max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 max_paddings: int = 256 + max_logprobs: int = 5 # OpenAI default value disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None @@ -212,6 +213,12 @@ def add_cli_args( type=int, default=EngineArgs.max_paddings, help='maximum number of paddings in a batch') + parser.add_argument( + '--max-logprobs', + type=int, + default=EngineArgs.max_logprobs, + help=('max number of log probs to return logprobs is specified in' + ' SamplingParams')) parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics') @@ -300,7 +307,8 @@ def create_engine_configs( self.trust_remote_code, self.download_dir, self.load_format, self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, - self.enforce_eager, self.max_context_len_to_capture) + self.enforce_eager, self.max_context_len_to_capture, + self.max_logprobs) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 9e52d20ca4980..df66139fddcd1 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -47,7 +47,7 @@ def __init__(self, request_id: str) -> None: self._queue = asyncio.Queue() self._finished = False - def put(self, item: RequestOutput) -> None: + def put(self, item: Union[RequestOutput, Exception]) -> None: if self._finished: return self._queue.put_nowait(item) @@ -110,6 +110,17 @@ def process_request_output(self, logger.info(f"Finished request {request_id}.") self.abort_request(request_id) + def process_exception(self, + request_id: str, + exception: Exception, + *, + verbose: bool = False) -> None: + """Propagate an exception from the engine.""" + self._request_streams[request_id].put(exception) + if verbose: + logger.info(f"Finished request {request_id}.") + self.abort_request(request_id) + def add_request(self, request_id: str, **engine_add_request_kwargs) -> AsyncStream: """Add a request to be sent to the engine on the next background @@ -377,10 +388,18 @@ async def engine_step(self) -> bool: for new_request in new_requests: # Add the request into the vLLM engine's waiting queue. 
# TODO: Maybe add add_request_batch to reduce Ray overhead - if self.engine_use_ray: - await self.engine.add_request.remote(**new_request) - else: - await self.engine.add_request_async(**new_request) + try: + if self.engine_use_ray: + await self.engine.add_request.remote(**new_request) + else: + await self.engine.add_request_async(**new_request) + except ValueError as e: + # TODO: use a vLLM specific error for failed validation + self._request_tracker.process_exception( + new_request["request_id"], + e, + verbose=self.log_requests, + ) if finished_requests: await self._engine_abort(finished_requests) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8a2573034c940..703756996b7f7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -18,7 +18,7 @@ from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, +from vllm.sequence import (Logprob, SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) @@ -473,6 +473,13 @@ def add_request( if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") + max_logprobs = self.get_model_config().max_logprobs + if (sampling_params.logprobs + and sampling_params.logprobs > max_logprobs) or ( + sampling_params.prompt_logprobs + and sampling_params.prompt_logprobs > max_logprobs): + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs.") if arrival_time is None: arrival_time = time.monotonic() prompt_token_ids = self.encode_request( @@ -583,6 +590,13 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Process prompt logprobs prompt_logprobs = outputs.prompt_logprobs if prompt_logprobs is not None: + # We can pick any sequence for the prompt. + seq = next(iter(seq_group.seqs_dict.values())) + all_token_ids = seq.get_token_ids() + for i, prompt_logprobs_for_token in enumerate(prompt_logprobs): + self._decode_logprobs(seq, seq_group.sampling_params, + prompt_logprobs_for_token, + all_token_ids[:i]) seq_group.prompt_logprobs = prompt_logprobs # Process samples @@ -930,12 +944,36 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) + def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, + logprobs: Dict[int, Logprob], + all_input_ids: List[int]) -> None: + if not logprobs: + return + for token_id, sample_logprob in logprobs.items(): + if (sample_logprob.decoded_token is None and token_id != -1): + all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] + _, new_text, prefix_offset, read_offset = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. 
+ spaces_between_special_tokens, + ) + sample_logprob.decoded_token = new_text + def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" + all_input_ids = seq.get_token_ids() + self._decode_logprobs(seq, prms, seq.output_logprobs[-1], + all_input_ids) + (new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally( self.get_tokenizer_for_seq(seq), - all_input_ids=seq.get_token_ids(), + all_input_ids=all_input_ids, prev_tokens=seq.tokens, prefix_offset=seq.prefix_offset, read_offset=seq.read_offset, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f4ad0aa5a0184..ba352f18f6454 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -82,8 +82,12 @@ async def create_chat_completion( return self.chat_completion_stream_generator( request, result_generator, request_id) else: - return await self.chat_completion_full_generator( - request, raw_request, result_generator, request_id) + try: + return await self.chat_completion_full_generator( + request, raw_request, result_generator, request_id) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: @@ -99,117 +103,133 @@ async def chat_completion_stream_generator( model_name = request.model created_time = int(time.monotonic()) chunk_object_type = "chat.completion.chunk" - - # Send first response for each request.n (index) with the role - role = self.get_chat_request_role(request) - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(role=role), - logprobs=None, - finish_reason=None) - chunk = ChatCompletionStreamResponse(id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - # Send response to echo the input portion of the last message - if request.echo: - last_msg_content = "" - if request.messages and isinstance( - request.messages, list) and request.messages[-1].get( - "content") and request.messages[-1].get( - "role") == role: - last_msg_content = request.messages[-1]["content"] - - if last_msg_content: - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=last_msg_content), - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - logprobs=None, - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" + first_iteration = True # Send response for each token for each request.n (index) previous_texts = [""] * request.n previous_num_tokens = [0] * request.n finish_reason_sent = [False] * request.n - async for res in result_generator: - res: RequestOutput - for output in res.outputs: - i = output.index - - if finish_reason_sent[i]: - continue - - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ - previous_num_tokens[i]:] if output.logprobs else None - - if request.logprobs: - logprobs = self._create_logprobs( - token_ids=delta_token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - initial_text_offset=len(previous_texts[i]), - ) - 
else: - logprobs = None - - delta_text = output.text[len(previous_texts[i]):] - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) - if output.finish_reason is None: - # Send token-by-token response for each request.n - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - logprobs=logprobs, - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - else: - # Send the finish response for each request.n only once - prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + previous_num_tokens[i], - ) - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - logprobs=logprobs, - finish_reason=output.finish_reason) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) - yield f"data: {data}\n\n" - finish_reason_sent[i] = True + try: + async for res in result_generator: + res: RequestOutput + # We need to do it here, because if there are exceptions in + # the result_generator, it needs to be sent as the FIRST + # response (by the try...catch). + if first_iteration: + # Send first response for each request.n (index) with the role + role = self.get_chat_request_role(request) + for i in range(request.n): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(role=role), + logprobs=None, + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + + # Send response to echo the input portion of the last message + if request.echo: + last_msg_content = "" + if request.messages and isinstance( + request.messages, + list) and request.messages[-1].get( + "content") and request.messages[-1].get( + "role") == role: + last_msg_content = request.messages[-1]["content"] + + if last_msg_content: + for i in range(request.n): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + logprobs=None, + model=model_name) + data = chunk.model_dump_json( + exclude_unset=True) + yield f"data: {data}\n\n" + first_iteration = False + + for output in res.outputs: + i = output.index + + if finish_reason_sent[i]: + continue + + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + top_logprobs = output.logprobs[ + previous_num_tokens[i]:] if output.logprobs else None + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + + delta_text = output.text[len(previous_texts[i]):] + previous_texts[i] = output.text + previous_num_tokens[i] = 
len(output.token_ids) + if output.finish_reason is None: + # Send token-by-token response for each request.n + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(content=delta_text), + logprobs=logprobs, + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + else: + # Send the finish response for each request.n only once + prompt_tokens = len(res.prompt_token_ids) + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(content=delta_text), + logprobs=logprobs, + finish_reason=output.finish_reason) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + if final_usage is not None: + chunk.usage = final_usage + data = chunk.model_dump_json(exclude_unset=True, + exclude_none=True) + yield f"data: {data}\n\n" + finish_reason_sent[i] = True + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 99a10196b5f73..a8244fd150753 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -26,107 +26,6 @@ [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] -async def completion_stream_generator( - request: CompletionRequest, - raw_request: Request, - on_abort, - result_generator: AsyncIterator[Tuple[int, RequestOutput]], - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, - num_prompts: int, -) -> AsyncGenerator[str, None]: - previous_texts = [""] * request.n * num_prompts - previous_num_tokens = [0] * request.n * num_prompts - has_echoed = [False] * request.n * num_prompts - - async for prompt_idx, res in result_generator: - - # Abort the request if the client disconnects. - if await raw_request.is_disconnected(): - await on_abort(f"{request_id}-{prompt_idx}") - raise StopAsyncIteration() - - for output in res.outputs: - i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
- - if request.echo and request.max_tokens == 0: - # only return the prompt - delta_text = res.prompt - delta_token_ids = res.prompt_token_ids - top_logprobs = res.prompt_logprobs - has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[i]: - # echo the prompt and first token - delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids - top_logprobs = res.prompt_logprobs + (output.logprobs or []) - has_echoed[i] = True - else: - # return just the delta - delta_text = output.text[len(previous_texts[i]):] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ - previous_num_tokens[i]:] if output.logprobs else None - - if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" - logprobs = create_logprobs_fn( - token_ids=delta_token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - initial_text_offset=len(previous_texts[i]), - ) - else: - logprobs = None - - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) - finish_reason = output.finish_reason - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text=delta_text, - logprobs=logprobs, - finish_reason=finish_reason, - ) - ]).model_dump_json() - yield f"data: {response_json}\n\n" - - if output.finish_reason is not None: # return final usage - logprobs = LogProbs() if request.logprobs is not None else None - prompt_tokens = len(res.prompt_token_ids) - completion_tokens = len(output.token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text="", - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - ], - usage=final_usage, - ).model_dump_json() - yield f"data: {response_json}\n\n" - - yield "data: [DONE]\n\n" - - def parse_prompt_format(prompt) -> Tuple[bool, list]: # get the prompt, openai supports the following # "a string, array of strings, array of tokens, or array of token arrays." 
@@ -151,73 +50,6 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: return prompt_is_tokens, prompts -def request_output_to_completion_response( - final_res_batch: List[RequestOutput], - request: CompletionRequest, - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, -) -> CompletionResponse: - choices = [] - num_prompt_tokens = 0 - num_generated_tokens = 0 - for final_res in final_res_batch: - assert final_res is not None - prompt_token_ids = final_res.prompt_token_ids - prompt_logprobs = final_res.prompt_logprobs - prompt_text = final_res.prompt - - for output in final_res.outputs: - if request.echo and request.max_tokens == 0: - token_ids = prompt_token_ids - top_logprobs = prompt_logprobs - output_text = prompt_text - elif request.echo and request.max_tokens > 0: - token_ids = prompt_token_ids + output.token_ids - top_logprobs = prompt_logprobs + output.logprobs - output_text = prompt_text + output.text - else: - token_ids = output.token_ids - top_logprobs = output.logprobs - output_text = output.text - - if request.logprobs is not None: - logprobs = create_logprobs_fn( - token_ids=token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - ) - else: - logprobs = None - - choice_data = CompletionResponseChoice( - index=len(choices), - text=output_text, - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - choices.append(choice_data) - - num_prompt_tokens += len(prompt_token_ids) - num_generated_tokens += sum( - len(output.token_ids) for output in final_res.outputs) - - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) - - return CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - def merge_async_iterators(*iterators): """Merge multiple asynchronous iterators into a single iterator. @@ -230,8 +62,11 @@ def merge_async_iterators(*iterators): finished = [False] * len(iterators) async def producer(i, iterator): - async for item in iterator: - await queue.put((i, item)) + try: + async for item in iterator: + await queue.put((i, item)) + except Exception as e: + await queue.put(e) finished[i] = True _tasks = [ @@ -242,6 +77,8 @@ async def producer(i, iterator): async def consumer(): while not all(finished) or not queue.empty(): item = await queue.get() + if isinstance(item, Exception): + raise item yield item await asyncio.gather(*_tasks) @@ -312,6 +149,7 @@ async def create_completion(self, request: CompletionRequest, prompt_token_ids=input_ids, lora_request=lora_request)) except ValueError as e: + # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) result_generator: AsyncIterator[Tuple[ @@ -325,27 +163,28 @@ async def create_completion(self, request: CompletionRequest, # Streaming response if stream: - return completion_stream_generator(request, - raw_request, - self.engine.abort, - result_generator, - self._create_logprobs, - request_id, - created_time, - model_name, - num_prompts=len(prompts)) + return self.completion_stream_generator(request, + raw_request, + result_generator, + request_id, + created_time, + model_name, + num_prompts=len(prompts)) # Non-streaming response final_res_batch: RequestOutput = [None] * len(prompts) - async for i, res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- await self.engine.abort(f"{request_id}-{i}") - return self.create_error_response("Client disconnected") - final_res_batch[i] = res - response = request_output_to_completion_response( - final_res_batch, request, self._create_logprobs, request_id, - created_time, model_name) + try: + async for i, res in result_generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. + await self.engine.abort(f"{request_id}-{i}") + return self.create_error_response("Client disconnected") + final_res_batch[i] = res + response = self.request_output_to_completion_response( + final_res_batch, request, request_id, created_time, model_name) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) # When user requests streaming but we don't stream, we still need to # return a streaming response with a single event. @@ -359,3 +198,179 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: return fake_stream_generator() return response + + async def completion_stream_generator( + self, + request: CompletionRequest, + raw_request: Request, + result_generator: AsyncIterator[Tuple[int, RequestOutput]], + request_id: str, + created_time: int, + model_name: str, + num_prompts: int, + ) -> AsyncGenerator[str, None]: + previous_texts = [""] * request.n * num_prompts + previous_num_tokens = [0] * request.n * num_prompts + has_echoed = [False] * request.n * num_prompts + + try: + async for prompt_idx, res in result_generator: + + # Abort the request if the client disconnects. + if await raw_request.is_disconnected(): + await self.engine.abort(f"{request_id}-{prompt_idx}") + raise StopAsyncIteration() + + for output in res.outputs: + i = output.index + prompt_idx * request.n + # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
+ + if request.echo and request.max_tokens == 0: + # only return the prompt + delta_text = res.prompt + delta_token_ids = res.prompt_token_ids + top_logprobs = res.prompt_logprobs + has_echoed[i] = True + elif request.echo and request.max_tokens > 0 and not has_echoed[ + i]: + # echo the prompt and first token + delta_text = res.prompt + output.text + delta_token_ids = res.prompt_token_ids + output.token_ids + top_logprobs = res.prompt_logprobs + (output.logprobs + or []) + has_echoed[i] = True + else: + # return just the delta + delta_text = output.text[len(previous_texts[i]):] + delta_token_ids = output.token_ids[ + previous_num_tokens[i]:] + top_logprobs = output.logprobs[previous_num_tokens[ + i]:] if output.logprobs else None + + if request.logprobs is not None: + assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + + previous_texts[i] = output.text + previous_num_tokens[i] = len(output.token_ids) + finish_reason = output.finish_reason + response_json = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[ + CompletionResponseStreamChoice( + index=i, + text=delta_text, + logprobs=logprobs, + finish_reason=finish_reason, + ) + ]).model_dump_json() + yield f"data: {response_json}\n\n" + + if output.finish_reason is not None: # return final usage + logprobs = LogProbs( + ) if request.logprobs is not None else None + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + response_json = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[ + CompletionResponseStreamChoice( + index=i, + text="", + logprobs=logprobs, + finish_reason=output.finish_reason, + ) + ], + usage=final_usage, + ).model_dump_json() + yield f"data: {response_json}\n\n" + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + data = self.create_streaming_error_response(str(e)) + print("yield", f"data: {data}\n\n") + yield f"data: {data}\n\n" + + print("yield", "data: [DONE]\n\n") + yield "data: [DONE]\n\n" + + def request_output_to_completion_response( + self, + final_res_batch: List[RequestOutput], + request: CompletionRequest, + request_id: str, + created_time: int, + model_name: str, + ) -> CompletionResponse: + choices = [] + num_prompt_tokens = 0 + num_generated_tokens = 0 + for final_res in final_res_batch: + assert final_res is not None + prompt_token_ids = final_res.prompt_token_ids + prompt_logprobs = final_res.prompt_logprobs + prompt_text = final_res.prompt + + for output in final_res.outputs: + if request.echo and request.max_tokens == 0: + token_ids = prompt_token_ids + top_logprobs = prompt_logprobs + output_text = prompt_text + elif request.echo and request.max_tokens > 0: + token_ids = prompt_token_ids + output.token_ids + top_logprobs = prompt_logprobs + output.logprobs + output_text = prompt_text + output.text + else: + token_ids = output.token_ids + top_logprobs = output.logprobs + output_text = output.text + + if request.logprobs is not None: + logprobs = self._create_logprobs( + token_ids=token_ids, + top_logprobs=top_logprobs, + 
num_output_top_logprobs=request.logprobs, + ) + else: + logprobs = None + + choice_data = CompletionResponseChoice( + index=len(choices), + text=output_text, + logprobs=logprobs, + finish_reason=output.finish_reason, + ) + choices.append(choice_data) + + num_prompt_tokens += len(prompt_token_ids) + num_generated_tokens += sum( + len(output.token_ids) for output in final_res.outputs) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + num_generated_tokens, + ) + + return CompletionResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 09945471e9af0..230d13d97dbba 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,4 +1,5 @@ import asyncio +import json from dataclasses import dataclass from http import HTTPStatus from typing import Dict, List, Optional, Union @@ -11,6 +12,7 @@ ModelCard, ModelList, ModelPermission) from vllm.lora.request import LoRARequest +from vllm.sequence import Logprob logger = init_logger(__name__) @@ -83,7 +85,7 @@ async def show_available_models(self) -> ModelList: def _create_logprobs( self, token_ids: List[int], - top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None, + top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None, num_output_top_logprobs: Optional[int] = None, initial_text_offset: int = 0, ) -> LogProbs: @@ -95,10 +97,10 @@ def _create_logprobs( for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is not None: - token_logprob = step_top_logprobs[token_id] + token_logprob = step_top_logprobs[token_id].logprob else: token_logprob = None - token = self.tokenizer.convert_ids_to_tokens(token_id) + token = step_top_logprobs[token_id].decoded_token logprobs.tokens.append(token) logprobs.token_logprobs.append(token_logprob) if len(logprobs.text_offset) == 0: @@ -110,7 +112,7 @@ def _create_logprobs( if num_output_top_logprobs: logprobs.top_logprobs.append({ - self.tokenizer.convert_ids_to_tokens(i): p + p.decoded_token: p.logprob for i, p in step_top_logprobs.items() } if step_top_logprobs else None) return logprobs @@ -124,6 +126,19 @@ def create_error_response( type=err_type, code=status_code.value) + def create_streaming_error_response( + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: + json_str = json.dumps({ + "error": + self.create_error_response(message=message, + err_type=err_type, + status_code=status_code).model_dump() + }) + return json_str + async def _check_model(self, request) -> Optional[ErrorResponse]: if request.model == self.served_model: return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 71655b216fb3d..b48dde0318d09 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -8,8 +8,9 @@ tensor_model_parallel_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, - SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, + SamplerOutput, SequenceData, SequenceGroupOutput, + SequenceOutput) 
from vllm.utils import is_neuron @@ -528,7 +529,10 @@ def _get_logprobs( prompt_logprobs_dict.update( zip(top_token_ids[sample_idx, :num_logprobs].tolist(), top_logprobs[sample_idx, :num_logprobs].tolist())) - group_prompt_logprobs.append(prompt_logprobs_dict) + group_prompt_logprobs.append({ + token_id: Logprob(logprob) + for token_id, logprob in prompt_logprobs_dict.items() + }) sample_idx += 1 query_result_idx += 1 result_prompt_logprobs.append(group_prompt_logprobs) @@ -553,7 +557,10 @@ def _get_logprobs( parent_id, :num_logprobs].tolist(), top_logprobs[sample_idx + parent_id, :num_logprobs].tolist())) - group_sample_logprobs.append(sample_logprobs_dict) + group_sample_logprobs.append({ + token_id: Logprob(logprob) + for token_id, logprob in sample_logprobs_dict.items() + }) result_sample_logprobs.append(group_sample_logprobs) sample_idx += len(seq_ids) diff --git a/vllm/sequence.py b/vllm/sequence.py index 04a9a90a68bcc..a110ab6b748f8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -8,8 +8,16 @@ from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest -PromptLogprobs = List[Optional[Dict[int, float]]] -SampleLogprobs = List[Dict[int, float]] + +@dataclass +class Logprob: + """Infos for supporting OpenAI compatible logprobs.""" + logprob: float + decoded_token: Optional[str] = None + + +PromptLogprobs = List[Optional[Dict[int, Logprob]]] +SampleLogprobs = List[Dict[int, Logprob]] class SequenceStatus(enum.Enum): @@ -196,12 +204,12 @@ def _append_tokens_to_blocks(self, token_ids: List[int]) -> None: def append_token_id( self, token_id: int, - logprobs: Dict[int, float], + logprobs: Dict[int, Logprob], ) -> None: assert token_id in logprobs self._append_tokens_to_blocks([token_id]) self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id]) + self.data.append_token_id(token_id, logprobs[token_id].logprob) def get_len(self) -> int: return self.data.get_len() @@ -456,7 +464,7 @@ def __init__( self, parent_seq_id: int, output_token: int, - logprobs: Dict[int, float], + logprobs: Dict[int, Logprob], ) -> None: self.parent_seq_id = parent_seq_id self.output_token = output_token @@ -470,9 +478,10 @@ def __repr__(self) -> str: def __eq__(self, other: object) -> bool: if not isinstance(other, SequenceOutput): raise NotImplementedError() - return (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token - and self.logprobs == other.logprobs) + equal = (self.parent_seq_id == other.parent_seq_id + and self.output_token == other.output_token) + log_probs_equal = other.logprobs == self.logprobs + return equal and log_probs_equal class SequenceGroupOutput: diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py index 591d1b1300c88..ab3e28389a04c 100644 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ b/vllm/worker/spec_decode/multi_step_worker.py @@ -77,7 +77,7 @@ def _append_new_tokens( token_id = seq_output.output_token token_logprob = seq_output.logprobs[token_id] - seq.append_token_id(token_id, token_logprob) + seq.append_token_id(token_id, token_logprob.logprob) def _shallow_copy_inputs( self, seq_group_metadata_list: List[SequenceGroupMetadata] From ff578cae54d23812b53b6c9b94b8bd0bb293a1fe Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 4 Mar 2024 14:01:40 -0800 Subject: [PATCH 054/113] Add health check, make async Engine more robust (#3015) Co-authored-by: Zhuohan Li --- tests/async_engine/test_async_llm_engine.py | 32 +++--- 
tests/async_engine/test_request_tracker.py | 38 +++---- vllm/engine/async_llm_engine.py | 113 +++++++++++++++----- vllm/engine/llm_engine.py | 20 ++++ 4 files changed, 138 insertions(+), 65 deletions(-) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1edb19c550010..1e31ff7373031 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -25,12 +25,8 @@ async def step_async(self): return [RequestOutput( request_id=self.request_id)] if self.request_id else [] - async def encode_request_async( - self, - *args, - **kwargs, - ): - return [1] + async def encode_request_async(self, *args, **kwargs): + pass def generate(self, request_id): self.request_id = request_id @@ -43,13 +39,16 @@ def add_request(self, **kwargs): self.add_request_calls += 1 async def add_request_async(self, **kwargs): - del kwargs # Unused self.add_request_calls += 1 + return def abort_request(self, request_id): del request_id # Unused self.abort_request_calls += 1 + def has_unfinished_requests(self): + return self.request_id is not None + class MockAsyncLLMEngine(AsyncLLMEngine): @@ -72,20 +71,21 @@ async def test_new_requests_event(): await engine.add_request("2", "", None) engine.engine.generate("2") await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls == 2 await asyncio.sleep(0) - assert engine.engine.step_calls == 3 + assert engine.engine.add_request_calls == 2 + assert engine.engine.step_calls >= 2 + await asyncio.sleep(0.001) + assert engine.engine.step_calls >= 3 engine.engine.stop_generating() - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 + await asyncio.sleep(0.001) + old_step_calls = engine.engine.step_calls + await asyncio.sleep(0.001) + assert engine.engine.step_calls == old_step_calls await engine.add_request("3", "", None) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 4043558bae919..7b1f4a9e1eb2f 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -4,25 +4,14 @@ from vllm.outputs import RequestOutput -class DummyEvent: - - def __init__(self): - self.flag = False - - def set(self): - self.flag = True - - def clear(self): - self.flag = False - - -def test_request_tracker(): +@pytest.mark.asyncio +async def test_request_tracker(): tracker = RequestTracker() - tracker.new_requests_event = DummyEvent() stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 1 assert new[0]["request_id"] == "1" assert not finished @@ -30,9 +19,10 @@ def test_request_tracker(): stream_2 = tracker.add_request("2") stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished 
= tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 2 assert new[0]["request_id"] == "2" assert new[1]["request_id"] == "3" @@ -43,7 +33,7 @@ def test_request_tracker(): # request_ids must be unique with pytest.raises(KeyError): tracker.add_request("1") - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() tracker.abort_request("1") new, finished = tracker.get_new_and_finished_requests() @@ -54,7 +44,8 @@ def test_request_tracker(): stream_4 = tracker.add_request("4") tracker.abort_request("4") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert len(finished) == 1 assert "4" in finished @@ -62,11 +53,12 @@ def test_request_tracker(): assert stream_4.finished stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], bool(finished))) + RequestOutput("2", "output", [], [], [], finished=True)) + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(finished) == 1 assert "2" in finished assert len(new) == 1 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df66139fddcd1..65ab0c0634176 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,8 +1,9 @@ import asyncio +import os import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) + Union, AsyncIterator, Callable) from vllm.lora.request import LoRARequest from vllm.config import ModelConfig @@ -14,28 +15,31 @@ from vllm.sampling_params import SamplingParams logger = init_logger(__name__) +ENGINE_ITERATION_TIMEOUT_S = int( + os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")) class AsyncEngineDeadError(RuntimeError): pass -def _raise_exception_on_finish(task: asyncio.Task, - request_tracker: "RequestTracker") -> None: +def _raise_exception_on_finish( + task: asyncio.Task, error_callback: Callable[[Exception], + None]) -> None: msg = ("Task finished unexpectedly. This should never happen! 
" "Please open an issue on Github.") + + exception = None try: - try: - task.result() - except asyncio.CancelledError: - return - except Exception as exc: - raise AsyncEngineDeadError( - msg + " See stack trace above for the actual cause.") from exc + task.result() + # NOTE: This will be thrown if task exits normally (which it should not) raise AsyncEngineDeadError(msg) - except Exception as exc: - request_tracker.propagate_exception(exc) - raise exc + except Exception as e: + exception = e + logger.error("Engine background task failed", exc_info=e) + error_callback(exception) + raise AsyncEngineDeadError( + msg + " See stack trace above for the actual cause.") from e class AsyncStream: @@ -78,13 +82,13 @@ def __init__(self) -> None: self._finished_requests: asyncio.Queue[str] = asyncio.Queue() self._new_requests: asyncio.Queue[Tuple[AsyncStream, dict]] = asyncio.Queue() - self.new_requests_event = None + self.new_requests_event = asyncio.Event() def __contains__(self, item): return item in self._request_streams - def init_event(self): - self.new_requests_event = asyncio.Event() + def __len__(self) -> int: + return len(self._request_streams) def propagate_exception(self, exc: Exception, @@ -93,9 +97,11 @@ def propagate_exception(self, (all if request_id is None).""" if request_id is not None: self._request_streams[request_id].put(exc) + self.abort_request(request_id) else: - for stream in self._request_streams.values(): + for rid, stream in self._request_streams.items(): stream.put(exc) + self.abort_request(rid) def process_request_output(self, request_output: RequestOutput, @@ -172,12 +178,15 @@ def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: self._request_streams[stream.request_id] = stream new_requests.append(new_request) - self.new_requests_event.clear() - return new_requests, finished_requests async def wait_for_new_requests(self): - await self.new_requests_event.wait() + if not self.has_new_requests(): + await self.new_requests_event.wait() + self.new_requests_event.clear() + + def has_new_requests(self): + return not self._new_requests.empty() class _AsyncLLMEngine(LLMEngine): @@ -285,6 +294,10 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs + async def check_health_async(self): + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + class AsyncLLMEngine: """An asynchronous wrapper for LLMEngine. 
@@ -335,27 +348,48 @@ def __init__(self, # collected self._background_loop_unshielded = None self.start_engine_loop = start_engine_loop - self._request_tracker = RequestTracker() + self._request_tracker: Optional[RequestTracker] = None + self._errored_with: Optional[BaseException] = None @property def is_running(self) -> bool: return (self.background_loop is not None - and not self.background_loop.done()) + and not self._background_loop_unshielded.done()) + + @property + def is_stopped(self) -> bool: + return self.errored or (self.background_loop is not None + and self._background_loop_unshielded.done()) + + @property + def errored(self) -> bool: + return self._errored_with is not None + + def set_errored(self, exc: Exception) -> None: + self._errored_with = exc + + def _error_callback(self, exc: Exception) -> None: + self.set_errored(exc) + self._request_tracker.propagate_exception(exc) def get_tokenizer(self): return self.engine.tokenizer.tokenizer def start_background_loop(self) -> None: """Start the background loop.""" + if self.errored: + raise AsyncEngineDeadError( + "Background loop has errored already.") from self._errored_with if self.is_running: raise RuntimeError("Background loop is already running.") - self._request_tracker.init_event() + # Initialize the RequestTracker here so it uses the right event loop. + self._request_tracker = RequestTracker() self._background_loop_unshielded = asyncio.get_event_loop( ).create_task(self.run_engine_loop()) self._background_loop_unshielded.add_done_callback( partial(_raise_exception_on_finish, - request_tracker=self._request_tracker)) + error_callback=self._error_callback)) self.background_loop = asyncio.shield(self._background_loop_unshielded) def _init_engine(self, *args, @@ -423,12 +457,23 @@ async def _engine_abort(self, request_ids: Iterable[str]): self.engine.abort_request(request_ids) async def run_engine_loop(self): - # Initialize the RequestTracker here so it uses the right event loop. has_requests_in_progress = False while True: if not has_requests_in_progress: + logger.debug("Waiting for new requests...") await self._request_tracker.wait_for_new_requests() - has_requests_in_progress = await self.engine_step() + logger.debug("Got new requests!") + + # Abort if iteration takes too long due to unrecoverable errors + # (eg. NCCL timeouts). + try: + has_requests_in_progress = await asyncio.wait_for( + self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + except asyncio.TimeoutError as exc: + logger.error( + "Engine iteration timed out. 
This should never happen!") + self.set_errored(exc) + raise await asyncio.sleep(0) async def add_request( @@ -647,3 +692,19 @@ async def do_log_stats(self) -> None: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() + + async def check_health(self): + """Raises an error if engine is unhealthy.""" + t = time.perf_counter() + logger.debug("Starting health check...") + if self.is_stopped: + raise AsyncEngineDeadError("Background loop is stopped.") + + if self.engine_use_ray: + try: + await self.engine.check_health.remote() + except ray.exceptions.RayActorError as e: + raise RuntimeError("Engine is dead.") from e + else: + await self.engine.check_health_async() + logger.debug(f"Health check took {time.perf_counter()-t}s") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 703756996b7f7..1f518cbf39b21 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1119,3 +1119,23 @@ def _compiled_ray_dag(self): for worker in self.workers ]) return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.parallel_config.worker_use_ray: + return + + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") From 9a4548bae73a8831f668116d8a6e88491d933a4e Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Mon, 4 Mar 2024 18:51:56 -0500 Subject: [PATCH 055/113] Fix the openai benchmarking requests to work with latest OpenAI apis (#2992) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/backend_request_func.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index e7f74e2feaf86..d7cac22ce7a99 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -275,10 +275,80 @@ async def async_request_openai_completions( return output +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "v1/chat/completions" + ), "OpenAI Chat API URL must end with 'v1/chat/completions'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.prompt, + }, + ], + "temperature": 0.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk in response.content: + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + + chunk = chunk.strip() + if not chunk: + continue + + chunk = chunk.decode("utf-8").lstrip("data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + body = json.loads(chunk) + if "content" in body["choices"][0]["delta"]: + generated_text += body["choices"][0]["delta"][ + "content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_vllm, "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, } From 05af6da8d927f70d15ab1ed25b01df3c967ad961 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Mon, 4 Mar 2024 21:14:53 -0500 Subject: [PATCH 056/113] [ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#3123) Co-authored-by: lcskrishna --- Dockerfile.rocm | 30 +++++++++++++++++++++++++----- vllm/worker/worker.py | 4 +--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 54ae06be6e101..a45265d79a6ac 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH" # In that case, we need to use the python reference attention implementation in vllm ARG BUILD_FA="1" +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd ..; \ fi -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
# Manually removed it so that later steps of numpy upgrade can continue RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9df518d155ec2..157e8c45836b1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest -from vllm.utils import is_hip class Worker: @@ -267,8 +266,7 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None - and not is_hip()): + elif (parallel_config.world_size > 1 and cupy_port is not None): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection. 
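The worker.py hunk above drops the `not is_hip()` guard, so the CuPy process group is now initialized on ROCm as well whenever more than one worker is in use, which is what makes CUDA-graph mode possible on AMD GPUs. The accompanying Dockerfile.rocm change builds ROCm's CuPy fork by default; it can be skipped by overriding the new BUILD_CUPY build argument, which mirrors the existing BUILD_FA switch. As a minimal illustration only (the helper below is hypothetical and not part of vLLM), the guard after this commit reduces to:

# Illustration only: a hypothetical helper mirroring the condition introduced
# by this commit. HIP/ROCm no longer bypasses CuPy initialization; only a
# world size of 1 or a missing CuPy port does.
from typing import Optional

def should_init_cupy_group(world_size: int, cupy_port: Optional[int]) -> bool:
    return world_size > 1 and cupy_port is not None
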
From 8999ec3c1632c91c194ab27df6bf274f5bcb0b5f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 5 Mar 2024 15:35:43 -0800 Subject: [PATCH 057/113] Store `eos_token_id` in `Sequence` for easy access (#3166) --- tests/test_cache_block_hashing.py | 3 +- vllm/core/scheduler.py | 7 ++--- vllm/engine/llm_engine.py | 30 +++++++++----------- vllm/model_executor/layers/sampler.py | 1 - vllm/outputs.py | 41 ++++++++++++++------------- vllm/sequence.py | 11 ++++--- 6 files changed, 44 insertions(+), 49 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 7c4ade7f8c8ed..c2067e52b59c0 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -54,7 +54,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for prompt in prompts: hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id) num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1ae58f525b0fb..c96c6d62ef19d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -59,10 +59,9 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self) -> bool: - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.lora_request.lora_int_id - if g.lora_request else 0, g.request_id)) + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=lambda g: + (g.lora_int_id, g.request_id)) @property def lora_requests(self) -> Set[LoRARequest]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1f518cbf39b21..52dc96e2b82e1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -491,8 +491,10 @@ def add_request( # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - lora_request) + eos_token_id, lora_request) # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects @@ -548,15 +550,13 @@ def _check_beam_search_early_stopping( if early_stopping is True: return True - current_worst_score = (current_worst_seq.get_beam_search_score( + current_worst_score = current_worst_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - current_worst_seq).eos_token_id)) + eos_token_id=current_worst_seq.eos_token_id) if early_stopping is False: - highest_attainable_score = (best_running_seq.get_beam_search_score( + highest_attainable_score = best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id) else: assert early_stopping == "never" if length_penalty > 0.0: @@ -570,8 +570,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id, + eos_token_id=best_running_seq.eos_token_id, seq_len=max_possible_length)) else: # Otherwise, beam search will prefer shorter sequences. 
The @@ -580,8 +579,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -679,8 +677,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, all_finished_seqs = existing_finished_seqs + new_finished_seqs # Sort the finished sequences by their scores. all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) for seq, parent, is_new in all_finished_seqs[:beam_width]: if is_new: @@ -707,8 +704,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if not seq.is_finished()] # Sort the running sequences by their scores. running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) # Check if we can stop the beam search. @@ -1014,8 +1010,8 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) and seq.get_last_token_id() - == self.get_tokenizer_for_seq(seq).eos_token_id): + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): seq.status = SequenceStatus.FINISHED_STOPPED return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b48dde0318d09..320cb443524ca 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -516,7 +516,6 @@ def _get_logprobs( if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): num_logprobs = sampling_params.prompt_logprobs - prompt_len = sampling_metadata.prompt_lens[i] prompt_tokens = sampling_metadata.seq_data[ seq_ids[0]].prompt_token_ids group_prompt_logprobs: PromptLogprobs = [None] diff --git a/vllm/outputs.py b/vllm/outputs.py index a6de2a5a2257b..4f9eddee11cd4 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -90,29 +90,30 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": # Get the top-n sequences. n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if seq_group.sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - seq_group.sampling_params.length_penalty) + if n == 1: + top_n_seqs = seqs else: - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] + if seq_group.sampling_params.use_beam_search: + sorting_key = lambda seq: seq.get_beam_search_score( + seq_group.sampling_params.length_penalty) + else: + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] # Create the outputs. 
- outputs: List[CompletionOutput] = [] - for seq in top_n_seqs: - logprobs = seq.output_logprobs - if seq_group.sampling_params.logprobs is None: - # NOTE: We need to take care of this case because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - logprobs = None - finshed_reason = SequenceStatus.get_finished_reason(seq.status) - output = CompletionOutput(seqs.index(seq), seq.output_text, - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), logprobs, - finshed_reason) - outputs.append(output) + # NOTE: We need omit logprobs here explicitly because the sequence + # always has the logprobs of the sampled tokens even if the + # logprobs are not requested. + include_logprobs = seq_group.sampling_params.logprobs + outputs = [ + CompletionOutput(seqs.index(seq), seq.output_text, + seq.get_output_token_ids(), + seq.get_cumulative_logprob(), + seq.output_logprobs if include_logprobs else None, + SequenceStatus.get_finished_reason(seq.status)) + for seq in top_n_seqs + ] # Every sequence in the sequence group should have the same prompt. prompt = seq_group.prompt diff --git a/vllm/sequence.py b/vllm/sequence.py index a110ab6b748f8..97b72fdc4cbeb 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,11 +142,13 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, + eos_token_id: int, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id self.prompt = prompt self.block_size = block_size + self.eos_token_id = eos_token_id self.lora_request = lora_request self.data = SequenceData(prompt_token_ids) @@ -362,12 +364,9 @@ def get_seqs( self, status: Optional[SequenceStatus] = None, ) -> List[Sequence]: - if status is None: - return list(self.seqs_dict.values()) - else: - return [ - seq for seq in self.seqs_dict.values() if seq.status == status - ] + return list(self.seqs_dict.values()) if status is None else [ + seq for seq in self.seqs_dict.values() if seq.status == status + ] def get_unfinished_seqs(self) -> List[Sequence]: return [ From 2efce05dc3c7c1e367617465f8f661a058499e37 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 5 Mar 2024 16:17:20 -0800 Subject: [PATCH 058/113] [Fix] Avoid pickling entire LLMEngine for Ray workers (#3207) Co-authored-by: Antoni Baum --- vllm/engine/llm_engine.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 52dc96e2b82e1..8484014c9a13f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -158,6 +158,11 @@ def __init__( if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() + def __reduce__(self): + # This is to ensure that the LLMEngine is not referenced in + # the closure used to initialize Ray worker actors + raise RuntimeError("LLMEngine should not be pickled!") + def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) @@ -280,6 +285,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -295,22 +302,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank, rank, distributed_init_method, - 
lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, )) driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - model_config, - parallel_config, - scheduler_config, - device_config, + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, driver_local_rank, driver_rank, distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) From 24aecf421a4ad5989697010963074904fead9a1b Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Wed, 6 Mar 2024 11:23:34 +0900 Subject: [PATCH 059/113] [Tests] Add block manager and scheduler tests (#3108) --- .buildkite/test-pipeline.yaml | 3 + tests/core/__init__.py | 0 tests/core/test_block_manager.py | 262 +++++++++++++++++++++++++++++++ tests/core/test_scheduler.py | 170 ++++++++++++++++++++ tests/core/utils.py | 27 ++++ 5 files changed, 462 insertions(+) create mode 100644 tests/core/__init__.py create mode 100644 tests/core/test_block_manager.py create mode 100644 tests/core/test_scheduler.py create mode 100644 tests/core/utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c65ab04b8ddda..15f971b66e3bd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,6 +13,9 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness + +- label: Core Test + command: pytest -v -s core - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py diff --git a/tests/core/__init__.py b/tests/core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py new file mode 100644 index 0000000000000..ecdf3025cffdf --- /dev/null +++ b/tests/core/test_block_manager.py @@ -0,0 +1,262 @@ +import pytest +import time +from typing import List + +from vllm import SamplingParams +from vllm.block import PhysicalTokenBlock +from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus +from vllm.utils import Device +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus + +from .utils import create_dummy_prompt + + +def test_block_allocator_allocate(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + num_free = num_cpu_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + num_free -= 1 + assert block not in cpu_allocator.free_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.allocate() + + +def test_block_allocator_free(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + blocks: List[PhysicalTokenBlock] = [] + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + blocks.append(block) + assert block not in cpu_allocator.free_blocks + + # Free all allocated cpu blocks. 
+ num_free = 0 + assert cpu_allocator.get_num_free_blocks() == num_free + for block in blocks: + cpu_allocator.free(block) + num_free += 1 + assert block in cpu_allocator.free_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.free(block) + + +def test_allocate(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range(num_gpu_blocks - 1): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + +def test_append_slot_single_seq(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate single seq to gpu block. + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Nothing to append. Sequence has no new logical blocks. + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks == after_blocks + + # Add block_size number of new tokens and append slot. + for i in range(block_size): + token_id = i + 5 + prompt.append_token_id(token_id, {token_id: 0.0}) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_append_slot_cow(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate prompt to gpu block. + prompt = Sequence(1, "one two three", [1, 2, 3], block_size) + child = prompt.fork(2) + token_id = 4 + child.append_token_id(token_id, {token_id: 0.0}) + seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), + time.time(), time.perf_counter) + block_manager.allocate(seq_group) + + # Append slot for child token. + # Last block being modified is shared. Copy on write occurs. 
+ assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + src_block, dst_block = block_manager.append_slot(child) + assert src_block != dst_block + + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_fork(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", + block_size - 1, + block_size=block_size) + block_manager.allocate(seq_group) + + # Fork prompt and copy block tables. + child = prompt.fork(2) + block_manager.fork(prompt, child) + assert block_manager.get_block_table( + prompt) == block_manager.get_block_table(child) + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: 0.0}) + block_manager.append_slot(child) + assert block_manager.get_block_table( + prompt) != block_manager.get_block_table(child) + + +def test_swap(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: 0.0}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert list(mapping.keys()) == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +def test_free(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Free allocated seq. + prompt_blocks = len(block_manager.get_block_table(prompt)) + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed seq is deleted. 
+ with pytest.raises(KeyError): + block_manager.get_block_table(prompt) + + +def test_reset(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. + original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py new file mode 100644 index 0000000000000..6322b2f2d5e9e --- /dev/null +++ b/tests/core/test_scheduler.py @@ -0,0 +1,170 @@ +from typing import List +import pytest # noqa + +from vllm.config import CacheConfig, SchedulerConfig +from vllm.core.scheduler import Scheduler +from vllm.sequence import SequenceGroup + +from .utils import create_dummy_prompt + + +def test_scheduler_add_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq group to scheduler. + num_seq_group = 4 + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == i + 1 + + +def test_scheduler_abort_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add multiple seq groups to scheduler. + num_seq_group = 4 + request_ids = set() + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + request_ids.add(str(i)) + + # Abort all added seq groups. + assert scheduler.get_num_unfinished_seq_groups() == num_seq_group + scheduler.abort_seq_group(request_ids) + assert scheduler.get_num_unfinished_seq_groups() == 0 + + +def test_scheduler_schedule_simple(): + block_size = 4 + num_seq_group = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + running: List[SequenceGroup] = [] + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + scheduler.add_seq_group(seq_group) + running.append(seq_group) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs( + )[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + # Schedule seq groups generation. 
+ seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + +def test_scheduler_schedule_preempt_abort(): + block_size = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, 2, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 2 + cache_config.num_gpu_blocks = 2 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + seq_a, seq_group_a = create_dummy_prompt("1", block_size) + seq_b, seq_group_b = create_dummy_prompt("2", block_size) + scheduler.add_seq_group(seq_group_a) + scheduler.add_seq_group(seq_group_b) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a, seq_group_b] + assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 2 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Append "generated" tokens, allowing the sequence to mark prompt tokens as + # processed. + token_id = 0 + seq_a.append_token_id(token_id, {token_id: 0.0}) + seq_b.append_token_id(token_id, {token_id: 0.0}) + + # Schedule seq groups generation and preempt seq group b. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a] + assert out.num_batched_tokens == 1 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Abort seq group a. Re-schedule seq group b prompt with recomputation. + scheduler.abort_seq_group("1") + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_b] + assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 1 + + +def test_scheduler_max_seqs(): + block_size = 4 + num_seq_group = 4 + max_seq_group = 2 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + all_seq_groups: List[SequenceGroup] = [] + # Add seq groups to scheduler. + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + all_seq_groups.append(seq_group) + + # Append 1 seq group + scheduler.add_seq_group(all_seq_groups[0]) + + # Schedule seq groups prompts. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Schedule seq groups generation. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Append 2 more seq group + scheduler.add_seq_group(all_seq_groups[1]) + scheduler.add_seq_group(all_seq_groups[2]) + + # Schedule seq groups prompts. + # Only 1 seq group should be scheduled since max_seq_group is 2 + # and one is prompting. 
+ _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]]) diff --git a/tests/core/utils.py b/tests/core/utils.py new file mode 100644 index 0000000000000..9c0cfe1a7cf66 --- /dev/null +++ b/tests/core/utils.py @@ -0,0 +1,27 @@ +import time +from typing import Tuple + +from vllm import SamplingParams +from vllm.sequence import Sequence, SequenceGroup + + +def create_dummy_prompt( + request_id: str, + prompt_length: int, + block_size: int = None) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) + prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) + seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), + time.time(), None, None) + + return prompt, seq_group + + +def round_up_to_next_block(seq_len: int, block_size: int) -> int: + return (seq_len + block_size - 1) // block_size From a33ce60c6629e8c22aaf002ae8478a685e726e3e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 6 Mar 2024 01:04:23 -0800 Subject: [PATCH 060/113] [Testing] Fix core tests (#3224) --- tests/core/test_block_manager.py | 49 ++++++++++++++++++++------------ tests/core/test_scheduler.py | 6 ++-- tests/core/utils.py | 2 +- vllm/sequence.py | 2 +- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index ecdf3025cffdf..04d01f7724e4f 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -6,7 +6,7 @@ from vllm.block import PhysicalTokenBlock from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus from vllm.utils import Device -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob from .utils import create_dummy_prompt @@ -22,7 +22,8 @@ def test_block_allocator_allocate(): for _ in range(num_cpu_blocks): block = cpu_allocator.allocate() num_free -= 1 - assert block not in cpu_allocator.free_blocks + + assert block.block_hash not in cpu_allocator.evictor assert cpu_allocator.get_num_free_blocks() == num_free with pytest.raises(ValueError): @@ -39,7 +40,7 @@ def test_block_allocator_free(): for _ in range(num_cpu_blocks): block = cpu_allocator.allocate() blocks.append(block) - assert block not in cpu_allocator.free_blocks + assert block.block_hash not in cpu_allocator.evictor # Free all allocated cpu blocks. num_free = 0 @@ -47,7 +48,7 @@ def test_block_allocator_free(): for block in blocks: cpu_allocator.free(block) num_free += 1 - assert block in cpu_allocator.free_blocks + assert block.block_hash in cpu_allocator.evictor assert cpu_allocator.get_num_free_blocks() == num_free with pytest.raises(ValueError): @@ -106,7 +107,7 @@ def test_append_slot_single_seq(): # Add block_size number of new tokens and append slot. 
for i in range(block_size): token_id = i + 5 - prompt.append_token_id(token_id, {token_id: 0.0}) + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) assert block_manager.can_append_slot(seq_group) before_blocks = block_manager.get_num_free_gpu_blocks() @@ -119,25 +120,37 @@ def test_append_slot_cow(): block_size = 4 num_cpu_blocks = 4 num_gpu_blocks = 4 - block_manager = BlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, + block_manager = BlockSpaceManager(block_size=block_size, + num_cpu_blocks=num_cpu_blocks, + num_gpu_blocks=num_gpu_blocks, watermark=0) - # Allocate prompt to gpu block. - prompt = Sequence(1, "one two three", [1, 2, 3], block_size) - child = prompt.fork(2) - token_id = 4 - child.append_token_id(token_id, {token_id: 0.0}) + # Allocate prompt to gpu block. There is one slot left in the block. + prompt = Sequence(seq_id=1, + prompt="one two three", + prompt_token_ids=[1, 2, 3], + block_size=block_size) + + # Fork the sequence, such that a COW will be required when we append a new + # token id. + child = prompt.fork(new_seq_id=2) + + # Allocate space for the sequence group. seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), time.time(), time.perf_counter) block_manager.allocate(seq_group) - # Append slot for child token. - # Last block being modified is shared. Copy on write occurs. + # Fork and append a new token id. We expect a COW to be scheduled. + token_id = 4 + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.fork(prompt, child) + assert block_manager.can_append_slot(seq_group) before_blocks = block_manager.get_num_free_gpu_blocks() - src_block, dst_block = block_manager.append_slot(child) + + maybe_src_dst_block = block_manager.append_slot(child) + assert maybe_src_dst_block is not None + src_block, dst_block = maybe_src_dst_block assert src_block != dst_block after_blocks = block_manager.get_num_free_gpu_blocks() @@ -165,7 +178,7 @@ def test_fork(): prompt) == block_manager.get_block_table(child) token_id = 4 # Append token to child. Block is shared so copy on write occurs. - child.append_token_id(token_id, {token_id: 0.0}) + child.append_token_id(token_id, {token_id: Logprob(0.0)}) block_manager.append_slot(child) assert block_manager.get_block_table( prompt) != block_manager.get_block_table(child) @@ -189,7 +202,7 @@ def test_swap(): # tokens will be written in the next forward pass. token_id = 0 prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: 0.0}) + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) # Swap seq group from GPU -> CPU. gpu_blocks = block_manager.get_block_table(prompt) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 6322b2f2d5e9e..ebfeb8ba04812 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -3,7 +3,7 @@ from vllm.config import CacheConfig, SchedulerConfig from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup +from vllm.sequence import SequenceGroup, Logprob from .utils import create_dummy_prompt @@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort(): # Append "generated" tokens, allowing the sequence to mark prompt tokens as # processed. token_id = 0 - seq_a.append_token_id(token_id, {token_id: 0.0}) - seq_b.append_token_id(token_id, {token_id: 0.0}) + seq_a.append_token_id(token_id, {token_id: Logprob(0.0)}) + seq_b.append_token_id(token_id, {token_id: Logprob(0.0)}) # Schedule seq groups generation and preempt seq group b. 
seq_group_meta, out = scheduler.schedule() diff --git a/tests/core/utils.py b/tests/core/utils.py index 9c0cfe1a7cf66..6469789e89386 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -18,7 +18,7 @@ def create_dummy_prompt( prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), - time.time(), None, None) + time.time(), None) return prompt, seq_group diff --git a/vllm/sequence.py b/vllm/sequence.py index 97b72fdc4cbeb..19dafe3cb0fc9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,7 +142,7 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, - eos_token_id: int, + eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id From 4cb3b924cdeb6b809f0a0311f9833253d9162699 Mon Sep 17 00:00:00 2001 From: Chujie Zheng Date: Wed, 6 Mar 2024 14:41:42 -0800 Subject: [PATCH 061/113] Add tqdm `dynamic_ncols=True` (#3242) --- vllm/entrypoints/llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62f1d172377f6..1f463bdaaedc3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -191,7 +191,9 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm(total=num_requests, desc="Processed prompts") + pbar = tqdm(total=num_requests, + desc="Processed prompts", + dynamic_ncols=True) # Run the engine. outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): From d3c04b6a39df016504c28ec3fc27ea58ca802a28 Mon Sep 17 00:00:00 2001 From: TechxGenus Date: Thu, 7 Mar 2024 08:19:14 +0800 Subject: [PATCH 062/113] Add GPTQ support for Gemma (#3200) --- vllm/model_executor/models/gemma.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03948132d32c3..bf1f164ff700d 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -325,11 +325,17 @@ def load_weights(self, if shard_name not in name: continue name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. if "norm.weight" in name: From cbf4c05b156c8705c6bb1a94b9edc0a5b4d26e20 Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Thu, 7 Mar 2024 03:39:28 -0500 Subject: [PATCH 063/113] Update requirements-dev.txt to include package for benchmarking scripts. 
(#3181) Co-authored-by: Zhuohan Li --- requirements-dev.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 55e102374fd73..dfcbfa4253f1c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,3 +21,6 @@ einops # required for MPT openai requests ray + +# Benchmarking +aiohttp From 2daf23ab0cf00da157b1255faddcf0a269283d36 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Mar 2024 01:45:50 -0800 Subject: [PATCH 064/113] Separate attention backends (#3005) --- .gitignore | 3 + setup.py | 48 +++- tests/kernels/test_prefix_prefill.py | 2 +- vllm/__init__.py | 30 ++- .../layers/attention/__init__.py | 5 + .../layers/attention/attention.py | 59 +++++ .../backends}/__init__.py | 0 .../layers/attention/backends/flash_attn.py | 124 ++++++++++ .../backends/xformers.py} | 216 +++++------------- .../layers/attention/ops/__init__.py | 0 .../layers/attention/ops/paged_attn.py | 138 +++++++++++ .../ops}/prefix_prefill.py | 0 vllm/model_executor/models/baichuan.py | 13 +- vllm/model_executor/models/bloom.py | 10 +- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/deepseek.py | 10 +- vllm/model_executor/models/falcon.py | 28 +-- vllm/model_executor/models/gemma.py | 10 +- vllm/model_executor/models/gpt2.py | 6 +- vllm/model_executor/models/gpt_bigcode.py | 10 +- vllm/model_executor/models/gpt_j.py | 4 +- vllm/model_executor/models/gpt_neox.py | 4 +- vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/llama.py | 12 +- vllm/model_executor/models/mixtral.py | 4 +- vllm/model_executor/models/mixtral_quant.py | 4 +- vllm/model_executor/models/mpt.py | 12 +- vllm/model_executor/models/olmo.py | 8 +- vllm/model_executor/models/opt.py | 8 +- vllm/model_executor/models/orion.py | 10 +- vllm/model_executor/models/phi.py | 4 +- vllm/model_executor/models/qwen.py | 4 +- vllm/model_executor/models/qwen2.py | 12 +- vllm/model_executor/models/stablelm.py | 10 +- vllm/model_executor/models/starcoder2.py | 4 +- 35 files changed, 558 insertions(+), 268 deletions(-) create mode 100644 vllm/model_executor/layers/attention/__init__.py create mode 100644 vllm/model_executor/layers/attention/attention.py rename vllm/model_executor/layers/{triton_kernel => attention/backends}/__init__.py (100%) create mode 100644 vllm/model_executor/layers/attention/backends/flash_attn.py rename vllm/model_executor/layers/{attention.py => attention/backends/xformers.py} (56%) create mode 100644 vllm/model_executor/layers/attention/ops/__init__.py create mode 100644 vllm/model_executor/layers/attention/ops/paged_attn.py rename vllm/model_executor/layers/{triton_kernel => attention/ops}/prefix_prefill.py (100%) diff --git a/.gitignore b/.gitignore index b5195629e5cf3..0b14c98270c41 100644 --- a/.gitignore +++ b/.gitignore @@ -184,3 +184,6 @@ _build/ # Benchmark dataset *.json + +# Third-party Python packages. +vllm/thirdparty_files/ diff --git a/setup.py b/setup.py index 745b5a9b2d02a..57d7a139e8237 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import os import re import subprocess +import sys import warnings from pathlib import Path from typing import List, Set @@ -14,6 +15,8 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) +# This is a temporary directory to store third-party packages. 
+THIRDPARTY_SUBDIR = "vllm/thirdparty_files" # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. @@ -324,8 +327,46 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_neuron(): - neuronxcc_version = get_neuronxcc_version() + + # Download the FlashAttention package. + # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/setup.py#L518-L530 + flash_attn_version = "2.5.6" + install_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "-q", + f"--target={install_dir}", + "einops", # Dependency of flash-attn. + f"flash-attn=={flash_attn_version}", + "--no-dependencies", # Required to avoid re-installing torch. + ], + env=dict(os.environ, CC="gcc"), + ) + + # Copy the FlashAttention package into the vLLM package after build. + class build_ext(BuildExtension): + + def run(self): + super().run() + target_dir = os.path.join(self.build_lib, THIRDPARTY_SUBDIR) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + self.copy_tree(install_dir, target_dir) + + class BinaryDistribution(setuptools.Distribution): + + def has_ext_modules(self): + return True + +else: + build_ext = BuildExtension + BinaryDistribution = setuptools.Distribution + if _is_neuron(): + neuronxcc_version = get_neuronxcc_version() vllm_extension_sources = [ "csrc/cache_kernels.cu", @@ -468,6 +509,7 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, + cmdclass={"build_ext": build_ext} if not _is_neuron() else {}, + distclass=BinaryDistribution, package_data=package_data, ) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index c068b38a66910..e881cd1ec3753 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,7 +3,7 @@ import time import torch -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( context_attention_fwd) from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask diff --git a/vllm/__init__.py b/vllm/__init__.py index f1e30f5eb6e6e..59f1345b58d42 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,12 +1,28 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster -from vllm.entrypoints.llm import LLM -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import SamplingParams + +# Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 +def _configure_system(): + import os + import sys + + # Importing flash-attn. + thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), + "thirdparty_files") + sys.path.insert(0, thirdparty_files) + + +_configure_system() +# Delete configuration function. 
+del _configure_system + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 +from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 +from vllm.engine.llm_engine import LLMEngine # noqa: E402 +from vllm.engine.ray_utils import initialize_cluster # noqa: E402 +from vllm.entrypoints.llm import LLM # noqa: E402 +from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 +from vllm.sampling_params import SamplingParams # noqa: E402 __version__ = "0.3.3" diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py new file mode 100644 index 0000000000000..1c42a3d28f976 --- /dev/null +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.attention.attention import Attention + +__all__ = [ + "Attention", +] diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py new file mode 100644 index 0000000000000..830e82e10f7ad --- /dev/null +++ b/vllm/model_executor/layers/attention/attention.py @@ -0,0 +1,59 @@ +"""Attention layer.""" +from typing import List, Optional + +import torch +import torch.nn as nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.utils import is_hip + + +class Attention(nn.Module): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and + torch.get_default_dtype() in (torch.float16, torch.bfloat16)): + # Ampere or later NVIDIA GPUs. + # NOTE(woosuk): FlashAttention does not support FP32. + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + self.backend = FlashAttentionBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + else: + # Turing and Volta NVIDIA GPUs or AMD GPUs. + # Or FP32 on any GPU. 
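# The backend dispatch in Attention.__init__ above reduces to a small
# device/dtype predicate. The sketch below restates it for illustration
# only; the helper name _prefer_flash_attn is not part of the patch, and
# a CUDA-enabled PyTorch build is assumed.
import torch

def _prefer_flash_attn(rocm: bool = False) -> bool:
    # FlashAttention is selected only on NVIDIA GPUs with compute
    # capability 8.0+ (Ampere or newer) and a half-precision default
    # dtype; everything else falls back to the xFormers backend.
    if rocm:
        return False
    if torch.cuda.get_device_capability()[0] < 8:
        return False
    return torch.get_default_dtype() in (torch.float16, torch.bfloat16)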
+ from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend + self.backend = XFormersBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + return self.backend.forward(query, key, value, key_cache, value_cache, + input_metadata) diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/vllm/model_executor/layers/attention/backends/__init__.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/__init__.py rename to vllm/model_executor/layers/attention/backends/__init__.py diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py new file mode 100644 index 0000000000000..512f4e49c7eb2 --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -0,0 +1,124 @@ +"""Attention layer with Flash and PagedAttention.""" +from typing import List, Optional + +# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. +from flash_attn import flash_attn_func +import torch + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) + + +class FlashAttentionBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.sliding_window = ((self.sliding_window, self.sliding_window) if + self.sliding_window is not None else (-1, -1)) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with FlashAttention and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.num_heads, + self.num_kv_heads, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention/backends/xformers.py similarity index 56% rename from vllm/model_executor/layers/attention.py rename to vllm/model_executor/layers/attention/backends/xformers.py index 2a82325b80213..bad2a648b6703 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -1,37 +1,19 @@ -"""Multi-head attention.""" +"""Attention layer with xFormers and PagedAttention.""" +import importlib from typing import List, Optional -import importlib import torch -import torch.nn as nn from xformers import ops as xops from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) -from vllm._C import ops -from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( - context_attention_fwd) +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) from vllm.utils import is_hip -_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. -_PARTITION_SIZE = 512 - - -class PagedAttention(nn.Module): - """MHA/MQA/GQA layer with PagedAttention. - This class takes query, key, and value tensors as input. The input tensors - can either contain prompt tokens or generation tokens. - The class does the following: - - 1. Reshape and store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention using either - xformers or the PagedAttention custom op. - 3. Return the output tensor. 
- """ +class XFormersBackend: def __init__( self, @@ -42,7 +24,6 @@ def __init__( alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, ) -> None: - super().__init__() self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -50,48 +31,17 @@ def __init__( self.sliding_window = sliding_window if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.alibi_slopes = alibi_slopes assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") - if self.head_size not in _SUPPORTED_HEAD_SIZES: - raise ValueError(f"head_size ({self.head_size}) is not supported. " - f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") - - self.use_ref_attention = self.check_use_ref_attention() - - def check_use_ref_attention(self) -> bool: - if not is_hip(): - return False - # For ROCm, check whether flash attention is installed or not. - # if not, use_ref_attention needs to be True - return importlib.util.find_spec("flash_attn") is None - - def ref_masked_attention( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - ) -> torch.Tensor: - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - seq_len, _, _ = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - - attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query, - key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out + self.use_ref_attention = _check_use_ref_attention() def forward( self, @@ -102,7 +52,7 @@ def forward( value_cache: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: - """PagedAttention forward pass. + """Forward pass with xFormers and PagedAttention. Args: query: shape = [batch_size, seq_len, num_heads * head_size] @@ -127,19 +77,14 @@ def forward( # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - ) + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) if input_metadata.is_prompt: - # normal attention + # Prompt run. if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): + # normal attention if self.num_kv_heads != self.num_heads: # As of Nov 2023, xformers only supports MHA. 
For MQA/GQA, # project the key and value tensors to the desired number of @@ -175,13 +120,19 @@ def forward( seq_len, query.dtype) if self.use_ref_attention: - output = self.ref_masked_attention( + output = _ref_masked_attention( query, key, value, + self.num_heads, + self.num_kv_heads, + self.head_size, + self.scale, ) - # Using view got RuntimeError: view size is not compatible with input tensor's size and stride - # (at least one dimension spans across two contiguous subspaces). Use reshape instead + # Using view got RuntimeError: view size is not compatible + # with input tensor's size and stride (at least one + # dimension spans across two contiguous subspaces). + # Use reshape instead. return output.reshape(batch_size, seq_len, hidden_size) # TODO(woosuk): Too many view operations. Let's try to reduce @@ -206,27 +157,21 @@ def forward( (is_hip()) else None, ) output = out.view_as(query) + else: # prefix-enabled attention - output = torch.empty_like(query) - context_attention_fwd( + output = PagedAttentionImpl.forward_prefix( query, key, value, - output, key_cache, value_cache, - input_metadata.block_tables, # [BS, max_block_per_request] - input_metadata.start_loc, - input_metadata.prompt_lens, - input_metadata.context_lens, - input_metadata.max_seq_len, - getattr(self, "alibi_slopes", None), + input_metadata, + self.alibi_slopes, ) - else: # Decoding run. - output = _paged_attention( + output = PagedAttentionImpl.forward_decode( query, key_cache, value_cache, @@ -274,76 +219,37 @@ def _make_alibi_bias( return attn_bias -def _paged_attention( +def _check_use_ref_attention() -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + +def _ref_masked_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - input_metadata: InputMetadata, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, num_kv_heads: int, + head_size: int, scale: float, - alibi_slopes: Optional[torch.Tensor], ) -> torch.Tensor: - output = torch.empty_like(query) - - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ( - (input_metadata.max_context_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory shortage. - use_v1 = input_metadata.max_context_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: - # Run PagedAttention V1. - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - else: - # Run PagedAttention V2. 
- assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - return output + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/model_executor/layers/attention/ops/__init__.py b/vllm/model_executor/layers/attention/ops/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py new file mode 100644 index 0000000000000..c5a9618c2395b --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -0,0 +1,138 @@ +from typing import List, Optional + +import torch + +from vllm._C import cache_ops +from vllm._C import ops +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( + context_attention_fwd) + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. +_PARTITION_SIZE = 512 + + +class PagedAttentionImpl: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ( + (input_metadata.max_context_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory shortage. 
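# The V1/V2 choice documented in the comments above is pure arithmetic, so a
# worked example may help. The helper _uses_v1 below only restates the
# condition from the patch and is not part of it.
_PARTITION_SIZE = 512  # same value as PARTITION_SIZE in paged_attention_v2_launcher

def _uses_v1(max_context_len: int, num_seqs: int, num_heads: int) -> bool:
    # Number of 512-token partitions needed for the longest context.
    max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

# 2048-token contexts with 8 sequences x 32 heads: 4 partitions but only 256
# (sequence, head) pairs, so the partitioned V2 kernel is used.
assert _uses_v1(2048, 8, 32) is False
# The same contexts with 32 sequences give 1024 pairs, which is already enough
# parallel work, so V1 is used.
assert _uses_v1(2048, 32, 32) is True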
+ use_v1 = input_metadata.max_context_len <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + alibi_slopes, + ) + return output diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/attention/ops/prefix_prefill.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/prefix_prefill.py rename to vllm/model_executor/layers/attention/ops/prefix_prefill.py diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 550dec6487f9e..6da0082b94285 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -27,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -151,10 +151,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) else: self.rotary_emb = get_rope( self.head_dim, @@ -163,8 +163,7 @@ def __init__( base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..0548b2b140b1b 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import 
InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -107,10 +107,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..1c5dcfacaff2b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -10,7 +10,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -87,7 +87,7 @@ def __init__( base=10000 * rope_ratio, is_neox_style=False, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 6dba952736921..f2dca3df27cfb 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -29,7 +29,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -229,10 +229,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..3c148be5b10f4 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -28,7 +28,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -150,10 +150,10 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -161,16 +161,16 @@ def __init__( alibi_slopes = 
(_get_alibi_slopes(self.total_num_heads) * self.inv_norm_factor) alibi_slopes = alibi_slopes[head_start:head_end].tolist() - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads, + alibi_slopes=alibi_slopes) else: - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index bf1f164ff700d..386a36cf492d6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -23,7 +23,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -123,10 +123,10 @@ def __init__(self, base=self.rope_theta, is_neox_style=True, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..3f7b21e5a4133 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -73,9 +73,7 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale) + self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..5c30d47d93e36 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -26,7 +26,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -85,10 +85,10 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scale, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..b8c6822e9825e 
100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -86,7 +86,7 @@ def __init__( base=rope_theta, is_neox_style=False, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..98107350e60b9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -87,7 +87,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..0ae0a85643456 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -7,7 +7,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -114,10 +114,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d35887cc0f6a3..4c163dfdab537 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,7 +30,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -139,11 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d78..d47834e519697 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -29,7 +29,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -197,7 +197,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..25c7f1978c0dc 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -32,7 +32,7 @@ from transformers import MixtralConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, ReplicatedLinear, @@ -214,7 +214,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..16ecac3d0529a 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -105,11 +105,11 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 9d563039208c8..fa7a6d850051e 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -43,7 +43,7 @@ from torch import nn from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearMethodBase, @@ -126,9 +126,9 @@ def __init__( base=rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) # Attention output projection. 
self.attn_out = RowParallelLinear( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..782f43ce265bd 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -89,9 +89,9 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) def forward( self, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 0b067d4fc8802..6039b1cdc3534 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -118,10 +118,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..039dc7a9b7675 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -43,7 +43,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -108,7 +108,7 @@ def __init__(self, max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 37af84c7cd53f..d4d5a4e8bb9a5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -104,7 +104,7 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen2.py 
b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..3586a7fb82778 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -30,7 +30,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -135,11 +135,11 @@ def __init__(self, max_position=max_position, base=self.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) def forward( self, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 44c57e5a6d4f9..d1a547f815616 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -122,10 +122,10 @@ def __init__(self, max_position=self.config.max_position_embeddings, base=self.config.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) def forward( self, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1eda07b724cae..efa235233372f 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -103,7 +103,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, From 385da2dae2b90e5273da8dfce881727bd9c574a1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 7 Mar 2024 11:42:42 -0800 Subject: [PATCH 065/113] Measure model memory usage (#3120) --- vllm/utils.py | 25 +++++++++++++++++++++++++ vllm/worker/model_runner.py | 18 ++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 9cdf623379516..5b94067cec777 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -3,6 +3,7 @@ import socket import subprocess import uuid +import gc from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -309,3 +310,27 @@ def create_kv_caches_with_random( f"Does 
not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches + + +class measure_cuda_memory: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aff8ebc903623..b01f865f1bb03 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl +from vllm.utils import in_wsl, measure_cuda_memory logger = init_logger(__name__) @@ -85,11 +85,17 @@ def __init__( self.model_config.enforce_eager = True def load_model(self) -> None: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + with measure_cuda_memory() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + self.model_memory_usage = m.consumed_memory + logger.info( + f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" + ) vocab_size = self.model.config.vocab_size From 8cbba4622c8c526b207b17e3ba51e18e2c766419 Mon Sep 17 00:00:00 2001 From: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:03:22 -0500 Subject: [PATCH 066/113] Possible fix for conflict between Automated Prefix Caching (#2762) and multi-LoRA support (#1804) (#3263) --- tests/test_cache_block_hashing.py | 46 +++++++++++++++++++++---------- vllm/sequence.py | 3 +- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index c2067e52b59c0..fb541f38f3489 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -2,8 +2,11 @@ Run `pytest tests/test_cache_block_hashing.py`. 
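# The measure_cuda_memory context manager added above is used in
# model_runner.py to report how much GPU memory the weights take. A minimal
# usage sketch (a plain tensor allocation stands in for model loading, and a
# CUDA device is assumed):
import torch
from vllm.utils import measure_cuda_memory

with measure_cuda_memory() as m:
    weights = torch.empty(1024, 1024, dtype=torch.float16, device="cuda")

print(f"Allocation took {m.consumed_memory / float(2**30):.4f} GB")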
""" +from typing import List, Optional + import pytest +from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import TokenizerGroup from vllm.sequence import Sequence @@ -36,7 +39,10 @@ def flatten_2d(li): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): +@pytest.mark.parametrize("concurrent_lora_int_ids", + [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, + concurrent_lora_int_ids: List[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -48,20 +54,30 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): hashes = [] for prefix in prefixes: - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - tokenizer.tokenizer.eos_token_id) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - seq_id += 1 + for lora_int_id in concurrent_lora_int_ids: + lora_request = None + + if lora_int_id is not None: + lora_request = LoRARequest( + f"example_lora_{lora_int_id}", + lora_int_id, + f"example/path/to/lora_{lora_int_id}", + ) + + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id, lora_request) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are # different everywhere. diff --git a/vllm/sequence.py b/vllm/sequence.py index 19dafe3cb0fc9..fee96a875dde5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -175,7 +175,8 @@ def hash_of_block(self, logical_idx: int) -> int: # TODO: The current hashing function is O(L^2). We should optimize # this in the future. 
num_tokens = self.num_hashed_tokens_of_block(logical_idx) - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + return hash( + (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id)) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size From b35cc93420e37b72dc1c4bbedb06012fd294b743 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 8 Mar 2024 01:37:28 +0100 Subject: [PATCH 067/113] Fix auto prefix bug (#3239) --- tests/engine/test_computed_prefix_blocks.py | 34 +++++++++++++++++++++ vllm/core/block_manager.py | 28 +++++++++-------- vllm/worker/model_runner.py | 1 + 3 files changed, 51 insertions(+), 12 deletions(-) create mode 100644 tests/engine/test_computed_prefix_blocks.py diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py new file mode 100644 index 0000000000000..ed35212cc3f11 --- /dev/null +++ b/tests/engine/test_computed_prefix_blocks.py @@ -0,0 +1,34 @@ +import pytest + +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +def test_computed_prefix_blocks(model: str, block_size: int): + # This test checks if we are able to run the engine to completion + # without triggering asserts. + # We are in a scenario where all blocks from the second request's prompt + # are full and already computed when the second request arrives. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? Is there an easy to follow video tutorial available " + "online for free?") + prompt2 = ( + " Please recommend to me some resources where I can learn not only to " + "handle technical difficulties of building a car, but also " + "decoration.") + + engine_args = EngineArgs(model=model, + block_size=block_size, + enable_prefix_caching=True) + + engine = LLMEngine.from_engine_args(engine_args) + sampling_params = SamplingParams() + + engine.add_request("0", prompt + prompt2, sampling_params) + engine.step() + engine.add_request("1", prompt, sampling_params) + engine.step() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index daf83827a7e52..52b120f227eda 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import count +from itertools import count, takewhile from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -426,23 +426,29 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_last_full_block_in_seq(self, seq: Sequence): + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return - block_table[max_full_block].computed = True + for i in reversed(range(max_full_block)): + if block_table[i].computed: + break + block_table[i].computed = True - def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + def get_all_computed_blocks(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - for block_idx in reversed(range(len(block_table))): - if block_table[block_idx].computed: 
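# The hash_of_block change above folds the LoRA ID into the prefix-cache key,
# so identical prompt tokens served under different adapters no longer share
# cached blocks. A small illustration (assuming lora_int_id is 0 for a
# sequence without a LoRA request):
block_tokens = (1, 2, 3, 4)              # token IDs covered by one block
no_lora_hash = hash((block_tokens, 0))   # base model, no LoRA request
lora_1_hash = hash((block_tokens, 1))    # LoRA adapter with int ID 1
# Different hashes mean the two sequences get separate cache entries.
assert no_lora_hash != lora_1_hash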
- return [b.block_number for b in block_table[:block_idx + 1]] - return [] + # NOTE We exclude the last block to avoid the case where the entire + # prompt is cached. This would cause erroneous behavior in model + # runner. + return [ + b.block_number + for b in takewhile(lambda b: b.computed, block_table[:-1]) + ] def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: @@ -451,14 +457,12 @@ def get_common_computed_block_ids(self, return [] ids_list = [ - self.get_all_block_ids_till_computed(seq) + self.get_all_computed_blocks(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): - # NOTE: We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): - self.compute_last_full_block_in_seq(seq) + self.compute_full_blocks_in_seq(seq) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b01f865f1bb03..9023b0c59b3fb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -215,6 +215,7 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_prompt_len = max(subquery_lens) + assert max_prompt_len > 0 input_tokens = _make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, From d2339d6840498397f6e373489ed120cd2cce8eb4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 7 Mar 2024 16:38:12 -0800 Subject: [PATCH 068/113] Connect engine healthcheck to openai server (#3260) --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 993a834e5a720..9f29b4ac92f48 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -160,6 +160,7 @@ async def validation_exception_handler(_, exc): @app.get("/health") async def health() -> Response: """Health check.""" + await openai_serving_chat.engine.check_health() return Response(status_code=200) From c59e120c557743b0fc8178ee1796c8a3def78bf4 Mon Sep 17 00:00:00 2001 From: whyiug Date: Fri, 8 Mar 2024 13:58:24 +0800 Subject: [PATCH 069/113] Feature add lora support for Qwen2 (#3177) --- csrc/punica/bgmv/bgmv_config.h | 2 ++ vllm/model_executor/models/qwen2.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index d5fee9c40d00c..3eb84ceb4d534 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -21,6 +21,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 2048) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ + f(in_T, out_T, W_T, narrow, 2816) \ f(in_T, out_T, W_T, narrow, 3072) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ @@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 12288) \ + f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 3586a7fb82778..4dd63f923e5f2 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py 
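# Patch 068 above routes the OpenAI server's /health endpoint through the
# engine health check, so a 200 response now implies the engine itself is
# responsive. A minimal probe, assuming the server runs at its default
# local address (the URL below is illustrative, not part of the patch):
import requests

response = requests.get("http://localhost:8000/health")
assert response.status_code == 200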
@@ -46,6 +46,7 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -264,12 +265,35 @@ def forward( class Qwen2ForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, config: Qwen2Config, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config super().__init__() self.config = config self.linear_method = linear_method From 1ece1ae829dcbc4b1b19b3e2d3042457615e862f Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Fri, 8 Mar 2024 14:22:59 +0800 Subject: [PATCH 070/113] [Minor Fix] Fix comments in benchmark_serving (#3252) --- benchmarks/benchmark_serving.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7d389a9c7d703..3f5e2d9c8f4dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -12,7 +12,7 @@ On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ - --tokenizer --dataset \ + --model --dataset \ --request-rate """ import argparse @@ -171,10 +171,10 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - print(f"Traffic request rate: {request_rate}") + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + benchmark_start_time = time.perf_counter() tasks = [] async for request in get_request(input_requests, request_rate): From 99c3cfb83c20d45899ab8cbfdddce98c7cffb7b1 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 8 Mar 2024 09:58:01 -0800 Subject: [PATCH 071/113] [Docs] Fix Unmocked Imports (#3275) --- docs/source/conf.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5a45c6f9d1e0a..61d24e1612128 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,8 +72,15 @@ # Mock out external dependencies here. autodoc_mock_imports = [ - "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", - "vllm.cuda_utils", "vllm._C" + "torch", + "transformers", + "psutil", + "prometheus_client", + "sentencepiece", + "vllm.cuda_utils", + "vllm._C", + "numpy", + "tqdm", ] for mock_target in autodoc_mock_imports: From 1cb0cc2975d1c42c445c795f955b783e78919502 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 8 Mar 2024 10:52:20 -0800 Subject: [PATCH 072/113] [FIX] Make `flash_attn` optional (#3269) --- .gitignore | 3 -- setup.py | 48 ++----------------- vllm/__init__.py | 30 +++--------- .../layers/attention/attention.py | 37 +++++++++++--- .../layers/attention/backends/flash_attn.py | 1 - 5 files changed, 41 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 0b14c98270c41..b5195629e5cf3 100644 --- a/.gitignore +++ b/.gitignore @@ -184,6 +184,3 @@ _build/ # Benchmark dataset *.json - -# Third-party Python packages. 
-vllm/thirdparty_files/ diff --git a/setup.py b/setup.py index 57d7a139e8237..745b5a9b2d02a 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ import os import re import subprocess -import sys import warnings from pathlib import Path from typing import List, Set @@ -15,8 +14,6 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) -# This is a temporary directory to store third-party packages. -THIRDPARTY_SUBDIR = "vllm/thirdparty_files" # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. @@ -327,46 +324,8 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) - - # Download the FlashAttention package. - # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/setup.py#L518-L530 - flash_attn_version = "2.5.6" - install_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) - subprocess.check_call( - [ - sys.executable, - "-m", - "pip", - "install", - "-q", - f"--target={install_dir}", - "einops", # Dependency of flash-attn. - f"flash-attn=={flash_attn_version}", - "--no-dependencies", # Required to avoid re-installing torch. - ], - env=dict(os.environ, CC="gcc"), - ) - - # Copy the FlashAttention package into the vLLM package after build. - class build_ext(BuildExtension): - - def run(self): - super().run() - target_dir = os.path.join(self.build_lib, THIRDPARTY_SUBDIR) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - self.copy_tree(install_dir, target_dir) - - class BinaryDistribution(setuptools.Distribution): - - def has_ext_modules(self): - return True - -else: - build_ext = BuildExtension - BinaryDistribution = setuptools.Distribution - if _is_neuron(): - neuronxcc_version = get_neuronxcc_version() +elif _is_neuron(): + neuronxcc_version = get_neuronxcc_version() vllm_extension_sources = [ "csrc/cache_kernels.cu", @@ -509,7 +468,6 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": build_ext} if not _is_neuron() else {}, - distclass=BinaryDistribution, + cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, package_data=package_data, ) diff --git a/vllm/__init__.py b/vllm/__init__.py index 59f1345b58d42..f1e30f5eb6e6e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,28 +1,12 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" - -# Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 -def _configure_system(): - import os - import sys - - # Importing flash-attn. - thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), - "thirdparty_files") - sys.path.insert(0, thirdparty_files) - - -_configure_system() -# Delete configuration function. 
-del _configure_system - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 -from vllm.engine.llm_engine import LLMEngine # noqa: E402 -from vllm.engine.ray_utils import initialize_cluster # noqa: E402 -from vllm.entrypoints.llm import LLM # noqa: E402 -from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 -from vllm.sampling_params import SamplingParams # noqa: E402 +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.engine.ray_utils import initialize_cluster +from vllm.entrypoints.llm import LLM +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import SamplingParams __version__ = "0.3.3" diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 830e82e10f7ad..724dd0511c5aa 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,12 +1,16 @@ """Attention layer.""" +from functools import lru_cache from typing import List, Optional import torch import torch.nn as nn +from vllm.logger import init_logger from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip +logger = init_logger(__name__) + class Attention(nn.Module): """Attention layer. @@ -30,17 +34,12 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and - torch.get_default_dtype() in (torch.float16, torch.bfloat16)): - # Ampere or later NVIDIA GPUs. - # NOTE(woosuk): FlashAttention does not support FP32. + if _use_flash_attn(): from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - # Turing and Volta NVIDIA GPUs or AMD GPUs. - # Or FP32 on any GPU. from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, @@ -57,3 +56,29 @@ def forward( ) -> torch.Tensor: return self.backend.forward(query, key, value, key_cache, value_cache, input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if is_hip(): + # AMD GPUs. + return False + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. 
" + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 512f4e49c7eb2..4abe195f274a7 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -1,7 +1,6 @@ """Attention layer with Flash and PagedAttention.""" from typing import List, Optional -# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. from flash_attn import flash_attn_func import torch From c2c5e0909ad4457ad542117939c2629ebe2db609 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 8 Mar 2024 13:33:10 -0800 Subject: [PATCH 073/113] Move model filelocks from `/tmp/` to `~/.cache/vllm/locks/` dir (#3241) --- vllm/model_executor/weight_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..24d78db8d2637 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -20,6 +20,9 @@ logger = init_logger(__name__) +_xdg_cache_home = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache')) +_vllm_filelocks_path = os.path.join(_xdg_cache_home, 'vllm/locks/') + class Disabledtqdm(tqdm): @@ -28,7 +31,8 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else _vllm_filelocks_path + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock From f48c6791b7bfc2579ad575d33ed83912f0bfb011 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Fri, 8 Mar 2024 17:16:14 -0800 Subject: [PATCH 074/113] [FIX] Fix prefix test error on main (#3286) --- vllm/model_executor/layers/attention/backends/flash_attn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 4abe195f274a7..58ccd461b993e 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -103,8 +103,6 @@ def forward( key_cache, value_cache, input_metadata, - self.num_heads, - self.num_kv_heads, self.alibi_slopes, ) else: From 8437bae6ef47a690d18c72f0da02c7e5abe83866 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 8 Mar 2024 23:32:46 -0800 Subject: [PATCH 075/113] [Speculative decoding 3/9] Worker which speculates, scores, and applies rejection sampling (#3103) --- .buildkite/test-pipeline.yaml | 5 +- tests/{worker => }/spec_decode/__init__.py | 0 tests/spec_decode/test_batch_expansion.py | 95 +++ tests/spec_decode/test_metrics.py | 157 +++++ .../spec_decode/test_multi_step_worker.py | 162 ++++- tests/spec_decode/test_spec_decode_worker.py | 591 ++++++++++++++++++ tests/spec_decode/test_utils.py | 111 ++++ tests/{worker => }/spec_decode/utils.py | 115 +++- tests/test_sequence.py | 50 ++ .../layers/rejection_sampler.py | 10 +- vllm/model_executor/layers/sampler.py | 2 +- vllm/sequence.py | 55 +- vllm/spec_decode/batch_expansion.py | 351 +++++++++++ vllm/spec_decode/interfaces.py | 77 +++ vllm/spec_decode/metrics.py | 174 ++++++ 
vllm/spec_decode/multi_step_worker.py | 366 +++++++++++ vllm/spec_decode/spec_decode_worker.py | 372 +++++++++++ vllm/spec_decode/util.py | 99 +++ vllm/worker/model_runner.py | 11 +- vllm/worker/spec_decode/multi_step_worker.py | 178 ------ vllm/worker/worker.py | 20 +- 21 files changed, 2786 insertions(+), 215 deletions(-) rename tests/{worker => }/spec_decode/__init__.py (100%) create mode 100644 tests/spec_decode/test_batch_expansion.py create mode 100644 tests/spec_decode/test_metrics.py rename tests/{worker => }/spec_decode/test_multi_step_worker.py (61%) create mode 100644 tests/spec_decode/test_spec_decode_worker.py create mode 100644 tests/spec_decode/test_utils.py rename tests/{worker => }/spec_decode/utils.py (60%) create mode 100644 tests/test_sequence.py create mode 100644 vllm/spec_decode/batch_expansion.py create mode 100644 vllm/spec_decode/interfaces.py create mode 100644 vllm/spec_decode/metrics.py create mode 100644 vllm/spec_decode/multi_step_worker.py create mode 100644 vllm/spec_decode/spec_decode_worker.py create mode 100644 vllm/spec_decode/util.py delete mode 100644 vllm/worker/spec_decode/multi_step_worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 15f971b66e3bd..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,7 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Engine Test - command: pytest -v -s engine + command: pytest -v -s engine test_sequence.py - label: Entrypoints Test command: pytest -v -s entrypoints @@ -52,6 +52,9 @@ steps: - label: Worker Test command: pytest -v -s worker +- label: Speculative decoding tests + command: pytest -v -s spec_decode + - label: LoRA Test command: pytest -v -s lora --forked diff --git a/tests/worker/spec_decode/__init__.py b/tests/spec_decode/__init__.py similarity index 100% rename from tests/worker/spec_decode/__init__.py rename to tests/spec_decode/__init__.py diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py new file mode 100644 index 0000000000000..fddc3995452cc --- /dev/null +++ b/tests/spec_decode/test_batch_expansion.py @@ -0,0 +1,95 @@ +import torch +import pytest + +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer + +from .utils import mock_worker, create_seq_group_metadata_from_prompts + + +@pytest.mark.parametrize('num_target_seq_ids', [100]) +def test_create_target_seq_id_iterator(num_target_seq_ids: int): + """Verify all new sequence ids are greater than all input + seq ids. + """ + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + + all_seq_ids = [ + [1, 3, 5, 7], + list(range(100)) + [0], + [100], + ] + + for seq_ids in all_seq_ids: + max_seq_id = max(seq_ids) + iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access + for _ in range(num_target_seq_ids): + assert next(iterator) > max_seq_id + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_get_token_ids_to_score(k: int): + """Verify correct tokens are selected for scoring. 
+ """ + proposal_token_ids = torch.tensor( + list(range(k)), + dtype=torch.int64, + device='cuda', + ) + + expected_output = [ + [], + ] + for i in range(proposal_token_ids.shape[0]): + expected_output.append(proposal_token_ids[:i + 1].tolist()) + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + + actual_output = [ + x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output + ] + + assert actual_output == expected_output + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_create_single_target_seq_group_metadata(k: int): + """Verify correct creation of a batch-expanded seq group metadata. + """ + + prompt_tokens = [1, 2, 3] + prev_output_tokens = [4, 5, 6] + + token_ids = list(range(k)) + + num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 + + final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( + token_ids) + + block_size = 32 + input_seq_group_metadata = create_seq_group_metadata_from_prompts( + [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], + [prev_output_tokens], [num_tokens_processed])[0] + + input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] + target_seq_id = 100 + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access + input_seq_group_metadata, + input_seq_id, + target_seq_id, + token_ids, + ) + + assert output.request_id == input_seq_group_metadata.request_id + assert len(output.seq_data) == 1 + assert output.seq_data[target_seq_id].get_prompt_token_ids( + ) == prompt_tokens + assert output.seq_data[target_seq_id].get_output_token_ids( + ) == prev_output_tokens + token_ids + + assert len(output.block_tables) == 1 + assert output.block_tables[ + target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py new file mode 100644 index 0000000000000..941ea37aa81e0 --- /dev/null +++ b/tests/spec_decode/test_metrics.py @@ -0,0 +1,157 @@ +import torch +import math +import pytest + +from unittest.mock import MagicMock + +from vllm.spec_decode.metrics import AsyncMetricsCollector + + +def test_initial_call_returns_none(): + """Expect first call to get metrics to return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=0) + maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert maybe_metrics is None + + +def test_second_call_returns_metrics(): + """Expect second call to not return None. 
+ """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("rank", [1, 2, 3, 4]) +def test_nonzero_rank_noop(rank): + """Verify nonzero ranks don't collect metrics. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=rank) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + +def test_noop_until_time(): + """Verify metrics aren't collected until enough time passes. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, + collect_interval_s + 0.1, collect_interval_s + 0.1 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("has_data", [True, False]) +def test_initial_metrics_has_correct_values(has_data: bool): + """Test correctness of metrics data. 
+ """ + if has_data: + num_accepted_tokens = 103 + num_emitted_tokens = 104 + num_draft_tokens = 105 + else: + num_accepted_tokens = 0 + num_emitted_tokens = 0 + num_draft_tokens = 0 + k = 5 + + num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + num_draft_tokens, k) + + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = num_draft_tokens + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k) + metrics = collector.maybe_collect_rejsample_metrics(k) + + assert metrics.num_spec_tokens == k + assert metrics.accepted_tokens == num_accepted_tokens + assert metrics.draft_tokens == num_draft_tokens + assert metrics.emitted_tokens == num_emitted_tokens + + if has_data: + assert metrics.draft_acceptance_rate == num_accepted_tokens / num_draft_tokens + assert metrics.system_efficiency == num_emitted_tokens / num_possible_tokens + else: + assert math.isnan(metrics.draft_acceptance_rate) + assert math.isnan(metrics.system_efficiency) diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py similarity index 61% rename from tests/worker/spec_decode/test_multi_step_worker.py rename to tests/spec_decode/test_multi_step_worker.py index ea54802903578..88bb7c293fe95 100644 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,14 +3,15 @@ import pytest from unittest.mock import MagicMock -from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.multi_step_worker import MultiStepWorker, DraftModelTop1Proposer from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from .utils import (create_execute_model_data, create_worker, create_seq_group_metadata_from_prompts, zero_kv_cache, patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) + assert_logprobs_dict_allclose, create_batch) @pytest.mark.parametrize('num_steps', list(range(1, 17))) @@ -259,3 +260,160 @@ def test_same_output_for_multi_step(): multi_step_output_logprobs, single_step_output_logprobs): assert_logprobs_dict_allclose(multi_step_logprobs, single_step_logprobs) + + +@torch.inference_mode() +def test_draft_proposals_full_speculation_len(): + """Verify DraftModelTop1Proposer correctly handles case where all sequences + can speculate. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=2048, + vocab_size=vocab_size, + ) + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(batch_size, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch(batch_size, k) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_no_speculations(): + """Verify DraftModelTop1Proposer correctly handles case where no sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + prompt_len = 10 + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=prompt_len + k - 1, + vocab_size=vocab_size, + ) + + execute_model_data, _, _ = create_batch(batch_size, + k, + prompt_len=prompt_len) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([0, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_mixed_k(): + """Verify DraftModelTop1Proposer correctly handles case some sequences can + speculate and some can't. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + small_prompt_len = 5 + long_prompt_len = 10 + prev_output_token_len = 20 + + expected_num_proposal_seqs = 6 + expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs + + prompt_len = [ + small_prompt_len for _ in range(expected_num_proposal_seqs - 1) + ] + [long_prompt_len + for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=long_prompt_len + prev_output_token_len + k - 1, + vocab_size=vocab_size, + ) + + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(expected_num_proposal_seqs, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint( + low=0, + high=vocab_size, + size=(expected_num_proposal_seqs, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch( + batch_size, + k, + prompt_len=prompt_len, + prev_output_token_len=prev_output_token_len, + ) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [ + k for _ in range(expected_num_proposal_seqs - 1) + ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py new file mode 100644 index 0000000000000..e919711c3ed2c --- /dev/null +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -0,0 +1,591 @@ +import torch +import random +import pytest +from unittest.mock import MagicMock + +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker, split_num_cache_blocks_evenly +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from .utils import mock_worker, create_batch, ExecuteModelData, create_sampler_output_list +from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics, AsyncMetricsCollector + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_draft_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the draft worker with correct + inputs. Everything else is mocked out. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + exception_secret = 'artifical stop' + draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) + + execute_model_data, _, _ = create_batch(batch_size, k) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + call_args_list = draft_worker.get_spec_proposals.call_args_list + assert len(call_args_list) == 1 + + for args, _ in call_args_list: + (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, actual_k) = args + actual_execute_model_data = ExecuteModelData(seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy) + assert actual_execute_model_data == execute_model_data + assert actual_k == k + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_target_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the target model with correct + inputs. Everything else is mocked out. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + vocab_size = 32_000 + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + exception_secret = 'artifical stop' + target_worker.execute_model.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + seen_contexts = [] + + call_args_list = target_worker.execute_model.call_args_list + assert len(call_args_list) == 1 + for args, kwargs in call_args_list: + target_execute_model_data = ExecuteModelData.from_dict(kwargs) + + assert len(target_execute_model_data.seq_group_metadata_list) == ( + k + 1) * batch_size + for seq_group_metadata in ( + target_execute_model_data.seq_group_metadata_list): + for seq_data in seq_group_metadata.seq_data.values(): + seen_contexts.append(seq_data.get_token_ids()) + + expected_seen_contexts = [] + + for prompt, prev_generated, draft_tokens in zip( + prompts, prev_output_tokens, proposal_token_ids.tolist()): + + for i in range(len(draft_tokens) + 1): + expected_seen_contexts.append(prompt + prev_generated + + draft_tokens[:i]) + + seen_contexts.sort() + expected_seen_contexts.sort() + assert expected_seen_contexts == seen_contexts + + 
+@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_rejection_sampler(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the rejection sampler with + correct inputs. Everything else is mocked out. + """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + exception_secret = 'artifical stop' + rejection_sampler.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + assert len(rejection_sampler.call_args_list) == 1 + args, _ = rejection_sampler.call_args_list[0] + (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, + actual_proposal_token_ids) = args + + assert torch.equal(actual_bonus_token_ids, + target_token_ids.reshape(batch_size, k + 1)[:, -1:]) + assert torch.equal( + actual_proposal_scores, + target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual_proposal_token_ids, proposal_token_ids) + assert torch.equal(actual_proposal_probs, proposal_probs) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_formats_output(k: int, batch_size: int): + """Verify SpecDecodeWorker formats sampler output correctly. + Everything else is mocked out. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + expected_output = create_sampler_output_list( + rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) + + seq_ids = [ + next(iter(seq_group_metadata.seq_data.keys())) + for seq_group_metadata in execute_model_data.seq_group_metadata_list + ] + actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} + expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + + for step in output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + actual_output_by_seq[seq_id].append(sample) + + for step in expected_output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + expected_output_by_seq[seq_id].append(sample) + + all_seen_seq_ids = set( + list(actual_output_by_seq.keys()) + + list(expected_output_by_seq.keys())) + for seq_id in all_seen_seq_ids: + actual_by_step = actual_output_by_seq[seq_id] + expected_by_step = expected_output_by_seq[seq_id] + + for i in range(k + 1): + if i >= len(actual_by_step): + assert expected_by_step[i].output_token == -1 + continue + assert actual_by_step[i].output_token == expected_by_step[ + i].output_token + assert actual_by_step[i].logprobs == expected_by_step[i].logprobs + + +@pytest.mark.parametrize('k', [1, 2]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('returns_metrics', [True, False]) +@torch.inference_mode() +def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): + """Verify SpecDecodeWorker collects metrics. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + mock_rejsample_metrics = MagicMock( + spec=SpecDecodeWorkerMetrics) if returns_metrics else None + metrics_collector.maybe_collect_rejsample_metrics.return_value = mock_rejsample_metrics + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics + + call_args_list = metrics_collector.maybe_collect_rejsample_metrics.call_args_list + assert len(call_args_list) == 1 + args, kwargs = call_args_list[0] + assert args[0] == k or kwargs.get('k', -1) == k + + +@pytest.mark.parametrize('k', [0]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_k_equals_zero(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when k is zero. This happens during prefill. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@pytest.mark.parametrize('k', [0, 5]) +@pytest.mark.parametrize('batch_size', [0]) +@torch.inference_mode() +def test_empty_input_batch(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when the input batch is empty. This can happen if the engine communicates + to the workers information without scheduling a batch. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@torch.inference_mode() +def test_init_model(): + """Verify SpecDecodeWorker invokes proposer/scorer worker init_model, as + well as other GPU initialization. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + worker.init_model() + + draft_worker.init_model.assert_called_once() + + target_worker.init_model.assert_called_once() + + metrics_collector.init_gpu_tensors.assert_called_once() + rejection_sampler.init_gpu_tensors.assert_called_once() + + +@torch.inference_mode() +def test_init_cache_engine(): + """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + workers. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + cache_config = MagicMock() + + worker.init_cache_engine(cache_config) + + draft_worker.init_cache_engine.assert_called_once_with(cache_config) + target_worker.init_cache_engine.assert_called_once_with(cache_config) + + +@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) +@pytest.mark.parametrize('available_cpu_blocks', [500]) +@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_profile_num_available_blocks(available_gpu_blocks: int, + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify SpecDecodeWorker correctly profiles num available GPU blocks. + Specifically, it should run profiling in the scorer worker, and then evenly + split the blocks between proposer and scorer worker. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + target_worker.profile_num_available_blocks.return_value = ( + available_gpu_blocks, available_cpu_blocks) + target_worker.get_cache_block_size_bytes.return_value = target_cache_block_size_bytes + draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + # These values do not directly impact the adjusted block size calculation, + # so they can be fixed. + gpu_memory_utilization = 0.9 + cpu_swap_space = 100 + block_size = 16 + + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + + target_worker.profile_num_available_blocks.assert_called_once_with( + block_size, gpu_memory_utilization, cpu_swap_space, "auto") + assert num_cpu_blocks == available_cpu_blocks + + assert num_gpu_blocks == split_num_cache_blocks_evenly( + target_cache_block_size_bytes, draft_kv_size_bytes, + available_gpu_blocks) + + +@pytest.mark.parametrize('available_gpu_blocks', + list(range(20)) + [1024, 1024**2]) +@pytest.mark.parametrize('target_cache_block_size_bytes', + [2 * 2 * 4096, 2 * 2 * 8192]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify split_num_cache_blocks_evenly does not exceed original memory + allocation in bytes. 
+ """ + num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, + draft_kv_size_bytes, + available_gpu_blocks) + assert (num_blocks * target_cache_block_size_bytes) + ( + num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * + target_cache_block_size_bytes) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py new file mode 100644 index 0000000000000..19833ddb06154 --- /dev/null +++ b/tests/spec_decode/test_utils.py @@ -0,0 +1,111 @@ +from vllm.spec_decode.util import get_all_seq_ids +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import split_batch_by_proposal_len + +import pytest +from unittest.mock import MagicMock + + +def test_get_all_seq_ids(): + """Verify get_all_seq_ids extracts all seq ids. + """ + expected_seq_ids = list(range(10)) + list(range(100, 110)) + + seq_group_metadata_list = [ + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + seq_data={ + seq_id: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + seq_id: MagicMock(), + }, + lora_request=None, + ) for seq_id in expected_seq_ids + ] + + actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) + assert actual_seq_ids == expected_seq_ids + + +@pytest.fixture +def fake_sequence_group_metadata(): + seq_ids = list(range(3)) + return [ + SequenceGroupMetadata( + request_id=str(i), + is_prompt=True, + seq_data={ + i: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + i: MagicMock(), + }, + lora_request=None, + ) for i in seq_ids + ] + + +def test_filter_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + expected_groups = [ + fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] + ] + expected_indices = [0, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 2] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + expected_groups = [ + fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] + ] + expected_indices = [1, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_empty_inputs(): + filtered_groups, indices = split_batch_by_proposal_len( + [], [], select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): + proposal_lens = [0, 0, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): + proposal_lens = [1, 1, 1] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] diff --git a/tests/worker/spec_decode/utils.py b/tests/spec_decode/utils.py similarity index 60% rename from tests/worker/spec_decode/utils.py rename to tests/spec_decode/utils.py index fa8767cf898aa..997093988c0eb 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,13 +1,16 @@ 
import torch -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Iterable, Union +from unittest.mock import MagicMock from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData +from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, + SamplerOutput, SequenceGroupOutput, SequenceOutput) from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed +from itertools import count from dataclasses import dataclass, fields @@ -24,6 +27,11 @@ def to_dict(self): return dict( (field.name, getattr(self, field.name)) for field in fields(self)) + @classmethod + def from_dict(cls, d): + cleaned = dict((field.name, d[field.name]) for field in fields(cls)) + return cls(**cleaned) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -50,6 +58,21 @@ def create_execute_model_data( ) +def mock_worker(cls=None, + vocab_size: int = 30_000, + max_model_len: int = 2048, + rank: int = 0) -> MagicMock: + if cls is None: + cls = Worker + + worker = MagicMock(spec=cls) + worker.vocab_size = vocab_size + worker.max_model_len = max_model_len + worker.rank = rank + worker.device = 'cuda:0' + return worker + + def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -117,25 +140,12 @@ def create_seq_group_metadata_from_prompts( block_size: int, final_seq_lens: List[int], continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, seq_ids: Optional[List[int]] = None, ) -> List[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. 
- num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - if seq_ids is None: seq_ids = list(i for i, _ in enumerate(prompts)) @@ -155,13 +165,15 @@ def create_seq_group_metadata_from_prompts( is_prompt=len(cont_token_ids) == 0, seq_data={ i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) + SequenceData( + prompt_token_ids=prompt_token_ids[:], + output_token_ids=cont_token_ids[:], + ), }, sampling_params=SamplingParams(temperature=0.0, ), block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) + ) for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)) ] @@ -178,3 +190,68 @@ def assert_logprobs_dict_allclose( expected = torch.tensor( single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) + + +def create_sampler_output_list( + token_ids: torch.Tensor, + probs: Iterable[Optional[torch.Tensor]], + seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + num_steps, batch_size = token_ids.shape + token_ids_by_step = token_ids.tolist() + + if seq_ids is None: + seq_ids = list(range(batch_size)) + + return [ + SamplerOutput(outputs=[ + SequenceGroupOutput( + samples=[ + SequenceOutput( + output_token=token_id, + parent_seq_id=seq_ids[seq_index], + logprobs={token_id: 0}, + ) + ], + prompt_logprobs=None, + ) for seq_index, token_id in enumerate(token_ids_by_step[step]) + ], + sampled_token_probs=probs[step], + sampled_token_ids=token_ids[step]) + for step in range(num_steps) + ] + + +def create_batch(batch_size, + k, + prompt_len: Union[int, List[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[List[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_seq_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + execute_model_data = create_execute_model_data( + create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks, + block_size, final_seq_lens, + prev_output_tokens, seq_ids), ) + return execute_model_data, prompts, prev_output_tokens diff --git a/tests/test_sequence.py b/tests/test_sequence.py new file mode 100644 index 0000000000000..e18df059d770f --- /dev/null +++ b/tests/test_sequence.py @@ -0,0 +1,50 @@ +import pytest + +from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput + + +@pytest.fixture +def sample_outputs(): + return [ + SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) + ], + prompt_logprobs=None) for i in range(5) + ] + + +@pytest.fixture +def sampler_output(sample_outputs): + return SamplerOutput(outputs=sample_outputs) + + +def test_sampler_output_initialization(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + assert sampler_output.spec_decode_worker_metrics is 
None + + +def test_sampler_output_getitem(sampler_output, sample_outputs): + assert sampler_output[2] == sample_outputs[2] + + +def test_sampler_output_setitem(sampler_output): + new_output = SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) + ], + prompt_logprobs=None) + sampler_output[2] = new_output + assert sampler_output[2] == new_output + + +def test_sampler_output_len(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + + +def test_sampler_output_eq(sample_outputs): + sampler_output1 = SamplerOutput(outputs=sample_outputs) + sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) + sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) + assert sampler_output1 == sampler_output2 + assert sampler_output1 != sampler_output3 diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3e1cfc783b8ef..5643454060251 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -21,8 +21,6 @@ def __init__(self, strict_mode: bool = False): nontrivial latency. """ super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 self._strict_mode = strict_mode # NOTE: A "bonus token" is accepted iff all proposal tokens are @@ -44,6 +42,14 @@ def init_gpu_tensors(self, rank: int) -> None: dtype=torch.long, device=device) + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + def forward( self, target_probs: torch.Tensor, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 320cb443524ca..19e7f630c4620 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -587,4 +587,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output + return SamplerOutput(outputs=sampler_output) diff --git a/vllm/sequence.py b/vllm/sequence.py index fee96a875dde5..37c102407a5f2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -2,12 +2,16 @@ import copy import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, TYPE_CHECKING from vllm.block import LogicalTokenBlock from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +if TYPE_CHECKING: + import torch + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Logprob: @@ -81,6 +85,8 @@ class SequenceData: Args: prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. Attributes: prompt_token_ids: The token IDs of the prompt. 
@@ -91,9 +97,13 @@ class SequenceData: def __init__( self, prompt_token_ids: List[int], + output_token_ids: Optional[List[int]] = None, ) -> None: + if output_token_ids is None: + output_token_ids = [] + self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] + self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 def append_token_id(self, token_id: int, logprob: float) -> None: @@ -117,6 +127,12 @@ def get_last_token_id(self) -> int: return self.prompt_token_ids[-1] return self.output_token_ids[-1] + def get_prompt_token_ids(self) -> int: + return self.prompt_token_ids + + def get_output_token_ids(self) -> int: + return self.output_token_ids + def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self.prompt_token_ids}, " @@ -506,6 +522,35 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. -SamplerOutput = List[SequenceGroupOutput] +@dataclass +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This datastructure implements methods so it can be used like a list, but + also has optional fields for device tensors. + """ + + outputs: List[SequenceGroupOutput] + + # On-device tensor containing probabilities of each token. + sampled_token_probs: Optional["torch.Tensor"] = None + + # On-device tensor containing the sampled token ids. + sampled_token_ids: Optional["torch.Tensor"] = None + + # Spec decode metrics populated by workers. + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py new file mode 100644 index 0000000000000..478c950f52873 --- /dev/null +++ b/vllm/spec_decode/batch_expansion.py @@ -0,0 +1,351 @@ +from typing import Iterator, List, Tuple, Optional, Dict +from itertools import chain, count + +import torch + +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) +from vllm.worker.worker import Worker +from vllm.spec_decode.util import nvtx_range, sampler_output_to_torch, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.interfaces import SpeculativeScorer, SpeculativeProposals, SpeculativeScores + +SeqId = int +TargetSeqId = int +TokenId = int + + +class BatchExpansionTop1Scorer(SpeculativeScorer): + """Implements a speculative scorer that uses batch expansion to get + probabilities of speculative tokens according to the scoring model. + + Batch expansion converts a list of sequences and multiple query positions + to a new batch of sequences, each with a single query position. This allows + for MQA-like scoring in speculative decoding without requiring an MQA + kernel. + + It is strictly less efficient than MQA scoring. + + It only supports scoring the top1 proposal tokens of the proposer, instead + of topk/tree. 
+ """ + + def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + self._scorer_worker = scorer_worker + self._device = device + self._vocab_size = vocab_size + + @nvtx_range("BatchExpansionTop1Scorer.score_proposals") + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> SpeculativeScores: + """Score the proposed tokens via the scorer model. + + This converts each input sequence to a set of k+1 target sequences. The + target sequences have the unique continuations to be scored and a + unique sequence ID that is different from all input sequence ids. + + If a speculative sequence length would exceed the max model length, then + no speculation is produced for that sequence. + + Args: + seq_group_metadata_list: The input sequence group metadata. + blocks_to_swap_in: This is passed to the worker during scoring. + blocks_to_swap_out: This is passed to the worker during scoring. + blocks_to_copy: This is passed to the worker during scoring. + k: The fixed proposal length. + proposals: The speculative proposals to score. + Returns: + SpeculativeScores: The scores of each speculative token, along with + which sequences were ignored during scoring. + """ + + # TODO(cade) perform this on GPU to remove blocking call. + proposal_lens_list = proposals.proposal_lens.tolist() + proposal_token_ids_list = proposals.proposal_token_ids.tolist() + + spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) + + target_sampler_output = self._scorer_worker.execute_model( + seq_group_metadata_list=target_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + all_tokens, all_probs = self._contract_batch( + original_bs=len(seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=k, + ) + + return SpeculativeScores( + probs=all_probs, + token_ids=all_tokens, + ) + + def _expand_batch( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[TokenId], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + """Given the input sequences and potentially multiple corresponding + proposal tokens, create a new batch where each sequence has a single + query token. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
+ spec_seqs, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + + target_seq_group_metadata_list = self._create_scoring_model_input( + spec_seqs, proposal_token_ids_list) + num_scoring_tokens = len(target_seq_group_metadata_list) + target_seq_group_metadata_list.extend(non_spec_seqs) + + return spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens + + def _contract_batch(self, original_bs: int, + target_sampler_output: List[SamplerOutput], + proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + """ + (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + batch_size, k = proposals.proposal_token_ids.shape + + target_token_ids = target_token_ids.squeeze().reshape( + batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(batch_size, k + 1, + self._vocab_size) + + all_tokens = torch.full(size=(original_bs, k + 1), + fill_value=-1, + device=self._device, + dtype=torch.long) + all_probs = torch.zeros(original_bs, + k + 1, + self._vocab_size, + device=self._device, + dtype=torch.float32) + + if non_spec_indices: + all_tokens[non_spec_indices, 0] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + + if spec_indices: + all_tokens[spec_indices] = target_token_ids + all_probs[spec_indices] = target_probs + + return all_tokens, all_probs + + def _create_scoring_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + ) -> List[SequenceGroupMetadata]: + """Given the original input sequences and proposed tokens from the draft + model, create a list of target sequences that can be used for scoring. + """ + + if not seq_group_metadata_list: + return [] + + target_seq_ids_iter = self._create_target_seq_id_iterator( + get_all_seq_ids(seq_group_metadata_list)) + + target_seq_group_metadata = list( + chain.from_iterable( + self._create_target_seq_group_metadata( + seq_group_metadata, + proposal_token_ids, + i, + target_seq_ids_iter, + ) for i, seq_group_metadata in enumerate( + seq_group_metadata_list))) + + return target_seq_group_metadata + + def _create_target_seq_group_metadata( + self, + input_seq_group_metadata: SequenceGroupMetadata, + proposal_token_ids: List[TokenId], # shape: [batch_size, k] + batch_index: int, + target_seq_ids_iter: Iterator[TargetSeqId], + ) -> List[SequenceGroupMetadata]: + """Given an input sequence group metadata and a list of draft tokens, + create a list of target SequenceGroupMetadata, one for each + token id that needs to be scored. + + Naive speculative decoding requires K target model scores, one for each + draft model token. However one can add a bonus token such that if each + token is accepted, then a final token may be sampled from the model. 
+ This function creates K+1 target SequenceGroupMetadata to take + advantage of the bonus token. + """ + assert not input_seq_group_metadata.is_prompt, ( + "Speculating on " + "prompts not yet supported") + assert len(input_seq_group_metadata.seq_data) == 1, ( + "Beam search " + "not supported in speculative decoding") + input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) + + token_ids_to_score = self._get_token_ids_to_score( + proposal_token_ids[batch_index]) + + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for token_ids in token_ids_to_score: + target_seq_group_metadata_list.append( + self._create_single_target_seq_group_metadata( + input_seq_group_metadata, + input_seq_id, + next(target_seq_ids_iter), + token_ids, + )) + + return target_seq_group_metadata_list + + def _create_single_target_seq_group_metadata( + self, + seq_group_metadata: SequenceGroupMetadata, + seq_id: SeqId, + target_seq_id: TargetSeqId, + token_ids: List[TokenId], + ) -> SequenceGroupMetadata: + """Create a single target SequenceGroupMetadata. + + Args: + seq_group_metadata: The metadata for the input sequence. + seq_id: The input sequence ID. + target_seq_id: The corresponding target sequence ID. + token_ids: The list of token ids that are to be appended to the + input sequence. + """ + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_token_ids = seq_data.get_prompt_token_ids() + new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + + return SequenceGroupMetadata( + request_id=seq_group_metadata.request_id, + is_prompt=seq_group_metadata.is_prompt, + seq_data={ + target_seq_id: + SequenceData( + prompt_token_ids=prompt_token_ids, + output_token_ids=new_output_token_ids, + ), + }, + sampling_params=seq_group_metadata.sampling_params, + block_tables={ + target_seq_id: seq_group_metadata.block_tables[seq_id], + }, + lora_request=None, + ) + + def _split_scoring_output( + self, sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = [ + num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - num_scoring_tokens + ] + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + + # Convert scores to tensors. + sampler_output.sampled_token_probs = spec_probs + sampler_output.sampled_token_ids = spec_sampled_tokens + target_token_ids, target_probs = sampler_output_to_torch( + [sampler_output]) + + # Convert non-speculative output tokens to tensors. 
+ sampler_output.sampled_token_probs = non_spec_probs + sampler_output.sampled_token_ids = non_spec_sampled_tokens + non_spec_target_token_ids, non_spec_target_probs = sampler_output_to_torch( + [sampler_output]) + + return target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs + + def _create_target_seq_id_iterator( + self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + """Create an iterator for creating target sequence ids. + Target sequence ids are distinct from sequence ids because we create a + distinct target sequence id for each proposal token to be scored. + + This implementation increments a counter starting at 1 + max of all + provided input sequence ids. + """ + return count(start=max(seq_ids) + 1) + + def _get_token_ids_to_score( + self, + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: + """Given an int tensor of proposal token ids, return a list of + token ids that should be scored. + + Returns k+1 output lists. The additional one is used for generating the + bonus token. + + Example: + Input: [0, 1, 2, 3] (k=4) + Output: (k+1 lists) + [] + [0] + [0, 1] + [0, 1, 2] + [0, 1, 2, 3] + """ + empty_token_ids = [] + + token_ids_to_score = [empty_token_ids] + token_ids_to_score.extend([ + full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids)) + ]) + return token_ids_to_score diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py new file mode 100644 index 0000000000000..9e53ffb60ac32 --- /dev/null +++ b/vllm/spec_decode/interfaces.py @@ -0,0 +1,77 @@ +from typing import List, Tuple, Optional, Dict +from dataclasses import dataclass +from abc import ABC, abstractmethod + +import torch + +from vllm.sequence import SequenceGroupMetadata + + +@dataclass +class SpeculativeProposals: + """Datastructure used to represent proposal tokens from some proposer. It + also tracks how many speculative tokens each sequence has. + """ + + # Speculative proposal tokens. + proposal_token_ids: torch.Tensor + + # Probabilities of the proposal tokens according to the proposer. + proposal_probs: torch.Tensor + + # The valid length of each proposal; can be zero. + proposal_lens: torch.Tensor + + def __repr__(self): + return (f"SpeculativeProposals(" + f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_probs={self.proposal_probs.shape}, " + f"proposal_lens={self.proposal_lens.shape})") + + +@dataclass +class SpeculativeScores: + """Datastructure used to represent the scores of speculative tokens + according to the scoring model. + """ + + # Probabilities of the speculative tokens according to the scoring model. + probs: torch.Tensor + + # Token ids sampled from the scoring model. Used for speculative bonus + # tokens and also non-speculative normal decoding. 
+ token_ids: torch.Tensor + + def __repr__(self): + return (f"SpeculativeScores(" + f"probs={self.probs.shape}, " + f"token_ids={self.token_ids.shape})") + + +class SpeculativeProposer(ABC): + + @abstractmethod + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + raise NotImplementedError + + +class SpeculativeScorer(ABC): + + @abstractmethod + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py new file mode 100644 index 0000000000000..65a2a4a63a98f --- /dev/null +++ b/vllm/spec_decode/metrics.py @@ -0,0 +1,174 @@ +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from typing import Optional +from vllm.utils import in_wsl +import time +from typing import Callable + + +@dataclass +class SpecDecodeWorkerMetrics: + """Dataclass holding metrics emitted from the spec decode worker. + """ + + # The empirical acceptance rate of the proposal method on a per-token basis. + # This is useful for evaluating how well the proposal method aligns with the + # scoring method. + draft_acceptance_rate: float + + # The empirical efficiency, measured as the number of tokens emitted by the + # system divided by the number of tokens that could be emitted by the system + # if the proposal method were perfect. + system_efficiency: float + + # The number of speculative tokens produced by the proposal method. + draft_tokens: int + + # The number of tokens emitted by the entire system. + emitted_tokens: int + + # The number of tokens accepted by the scoring model and verification + # routine, e.g. Llama2-70B and lossless rejection sampling. + # + # NOTE: Any token accepted by the verification routine is considered + # accepted (regardless of if the speculative prefix is also accepted). The + # user will usually see less accepted tokens. This metric is helpful when + # evaluating alignment of the proposal method with the scoring model. + accepted_tokens: int + + # The number of speculative tokens per sequence. + num_spec_tokens: int + + +Timer = Callable[[], float] + + +class AsyncMetricsCollector: + """Class which copies rejection sampler metrics from the device to CPU on a + non-default Torch stream. + """ + + def __init__(self, + rejection_sampler: RejectionSampler, + timer: Optional[Timer] = None, + collect_interval_s: float = 5.0): + self._rejection_sampler = rejection_sampler + self._timer = time.time if timer is None else timer + + self._rank: Optional[int] = None + + # We don't have a device set yet. 
+ self._copy_stream: Optional[torch.cuda.Stream] = None + + self._in_flight_copy: Optional[torch.cuda.Event] = None + + pin_memory = not in_wsl() + self._aggregate_num_accepted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_emitted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_draft_tokens = 0 + + self._rejsample_metrics_collect_interval_s = collect_interval_s + self._last_metrics_collect_time = self._timer() + + def init_gpu_tensors(self, rank: int) -> None: + self._rank = rank + self._copy_stream = torch.cuda.Stream() + + def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # If a copy was initiated in the previous call, collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Otherwise, check if we should start a new copy. + if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + def _should_collect_rejsample_metrics(self, now: float) -> bool: + """Return whether or not this iteration should print rejection sampling + metrics. + """ + if self._rank != 0: + return False + + if (now - self._last_metrics_collect_time < + self._rejsample_metrics_collect_interval_s): + return False + return True + + def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: + """Copy rejection sampling metrics (number of accepted tokens, etc) to + CPU asynchronously. + + Returns a CUDA event recording when the copy is complete. + """ + self._copy_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._copy_stream): + self._aggregate_num_accepted_tokens.copy_( + self._rejection_sampler.num_accepted_tokens, non_blocking=True) + self._aggregate_num_emitted_tokens.copy_( + self._rejection_sampler.num_emitted_tokens, non_blocking=True) + # Number of draft tokens is calculated on CPU, so no copy is + # required. + self._aggregate_num_draft_tokens = ( + self._rejection_sampler.num_draft_tokens) + + aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready.record(self._copy_stream) + + return aggregate_metrics_ready + + def _collect_rejsample_metrics( + self, k: int, + ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: + """Create metrics object from statistics copied asynchronously. + + Args: + k: int. The number of speculative tokens; used to determine system + efficiency. + ready_event: torch.cuda.Event. The CUDA event recording when the + async GPU->CPU copy is complete. 
+ """ + + ready_event.synchronize() + accepted_tokens = self._aggregate_num_accepted_tokens.item() + emitted_tokens = self._aggregate_num_emitted_tokens.item() + draft_tokens = self._aggregate_num_draft_tokens + + num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + + if draft_tokens > 0: + draft_acceptance_rate = accepted_tokens / draft_tokens + else: + draft_acceptance_rate = float("nan") + + if num_possible_tokens > 0: + system_efficiency = emitted_tokens / num_possible_tokens + else: + system_efficiency = float("nan") + + return SpecDecodeWorkerMetrics( + num_spec_tokens=k, + draft_acceptance_rate=draft_acceptance_rate, + system_efficiency=system_efficiency, + accepted_tokens=accepted_tokens, + draft_tokens=draft_tokens, + emitted_tokens=emitted_tokens, + ) + + @staticmethod + def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + # Divide by k since batch size can be variable. + total_num_spec_seqs = draft_tokens / k + num_accepted_per_seq_if_all_accepted = k + 1 + return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py new file mode 100644 index 0000000000000..f7be14d3d22c2 --- /dev/null +++ b/vllm/spec_decode/multi_step_worker.py @@ -0,0 +1,366 @@ +from typing import List, Dict, Optional, Tuple +import copy + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.worker import Worker +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.util import sampler_output_to_torch + + +class MultiStepWorker(Worker): + """The MultiStepWorker is equivalent to a Worker except that it allows + multiple forward passes in a single call, assuming the scheduler has + allocated enough space to store the additional KV. This reduces overhead + by invoking the scheduler less. + + The MultiStepWorker does not support cache swap operations, or beam search. + Cache swap operations do not require large modifications. On the other hand, + beam search requires memory allocations during sequence forks and thus + requires more thought for MultiStepWorker support. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._proposer: Optional[DraftModelTop1Proposer] = None + + def init_model(self): + super().init_model() + + self._proposer = DraftModelTop1Proposer( + self, + self.device, + self.max_model_len, + self.vocab_size, + ) + + @torch.inference_mode() + def execute_model_multi_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_steps: int, + ) -> List[SamplerOutput]: + """Run the model forward pass num_steps times. Returns the list of + sampler output, one per model forward pass. + """ + self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, + blocks_to_swap_out, blocks_to_copy) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + seq_group_metadata_list) + + # Assert enough KV space for num_steps tokens per sequence. + self._assert_enough_kv_space(seq_group_metadata_list, num_steps) + + # Run model num_steps times. 
+ model_outputs = [] + for _ in range(num_steps): + model_output = super().execute_model( + seq_group_metadata_list=copied_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs + + def get_spec_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + + return self._proposer.get_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + max_proposal_len, + ) + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. 
+ seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _raise_if_unsupported( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + +class DraftModelTop1Proposer(SpeculativeProposer): + """Helper class which separates out sequences which would exceed the max + model length when speculated upon. + + This allows combinations of models such as JackFram/llama-68m draft with + meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of + 2048 while Llama2-13b has max_position_embeddings of 4096. + + We treat the sequences which exceed the proposal draft model length as + "non-spec sequences". Essentially they skip the draft model and go through + normal decoding in the target model. + + Currently, only proposal_lens of 0 and k are supported, where k is a global + batch proposal length. In the future vLLM should support per-sequence + proposal lengths. + """ + + def __init__( + self, + draft_worker: MultiStepWorker, + device: str, + max_model_len: int, + vocab_size: int, + ): + self._draft_worker = draft_worker + self._device = device + self._max_model_len = max_model_len + self._vocab_size = vocab_size + + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Get speculative proposals given the input batch. + + Sequences which would exceed the max model length are skipped during + speculation. + """ + + # Split speculative- and non-speculative- sequences. + proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) + + if nonzero_proposal_len_seqs: + # Speculate tokens using the draft worker for the speculative + # sequences. 
+ maybe_sampler_output = self._draft_worker.execute_model_multi_step( + seq_group_metadata_list=nonzero_proposal_len_seqs, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + num_steps=max_proposal_len, + ) + else: + # If no sequences can be speculated, set sampler output to None. + maybe_sampler_output = None + + # Combine speculative- and non-speculative sequences into the same + # representation. + proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( + batch_size=len(seq_group_metadata_list), + max_proposal_len=max_proposal_len, + maybe_sampler_output=maybe_sampler_output, + proposal_lens=proposal_lens, + nonzero_proposal_len_indices=nonzero_proposal_len_indices, + ) + + proposals = SpeculativeProposals( + proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + ) + + return proposals + + def _split_by_max_model_len( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + max_proposal_len: int, + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + """Determine which sequences would exceed the max model length. + """ + + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_data = next(iter(seq_group_metadata.seq_data.values())) + seq_len = seq_data.get_len() + + # Currently only proposal lens of 0 or the global batch proposal len + # are supported. + if seq_len + max_proposal_len < self._max_model_len: + proposal_lens.append(max_proposal_len) + nonzero_proposal_len_seqs.append(seq_group_metadata) + nonzero_proposal_len_indices.append(i) + else: + proposal_lens.append(0) + + return proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices + + def _merge_outputs( + self, + batch_size: int, + max_proposal_len: int, + maybe_sampler_output: Optional[SamplerOutput], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty tensors. + proposal_tokens = torch.zeros(0, + max_proposal_len, + dtype=torch.long, + device=self._device) + proposal_probs = torch.zeros(0, + max_proposal_len, + self._vocab_size, + dtype=torch.float32, + device=self._device) + proposal_lens = torch.zeros(len(proposal_lens), + dtype=torch.long, + device=self._device) + return proposal_tokens, proposal_probs, proposal_lens + + sampler_output = maybe_sampler_output + + proposal_tokens, proposal_probs = sampler_output_to_torch( + sampler_output) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. 
[-1, -1, -1] + + entire_proposal_tokens = torch.full(size=(batch_size, + *proposal_tokens.shape[1:]), + fill_value=-1, + dtype=torch.long, + device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = torch.zeros(batch_size, + *proposal_probs.shape[1:], + dtype=torch.float32, + device=self._device) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = entire_proposal_tokens, entire_proposal_probs + + proposal_lens = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens[nonzero_proposal_len_indices] = max_proposal_len + + return proposal_tokens, proposal_probs, proposal_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py new file mode 100644 index 0000000000000..890e479202372 --- /dev/null +++ b/vllm/spec_decode/spec_decode_worker.py @@ -0,0 +1,372 @@ +from typing import List, Tuple, Optional, Dict +from functools import cached_property + +import torch + +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) +from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.config import CacheConfig +from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.spec_decode.interfaces import SpeculativeScorer + + +class SpecDecodeWorker: + """Worker which implements speculative decoding. + + Speculative decoding reduces decoding per-token latency by using a proposal + method, such as a small draft model, to speculate ahead of a larger LLM. The + probabilities of the speculative tokens are then determined by the larger + LLM, after which some verification routine determines which (if any) of the + speculative tokens are accepted by the larger LLM. + + See https://github.com/vllm-project/vllm/pull/2188 and + https://github.com/vllm-project/vllm/pull/3103 for more info. + + The current implementation has the following limitations: + * Only draft-model proposal is implemented (contributions for more forms are + welcome!). + * Only top-1 proposal and scoring are implemented. Tree-attention is left as + future work. + * Only lossless rejection sampling is supported. Contributions adding lossy + verification routines are welcome (e.g. Medusa's typical acceptance). + * All sequences in a batch must have the same proposal length, or zero. This + can be improved by having per-sequence speculation in the future. + * The scoring forward pass is done without an MQA kernel, which is + suboptimal especially as the batch size, proposal length, and sequence + lengths grow. Contributions to add a MQA scoring are welcome once + correctness tests pass. + More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. + """ + + def __init__( + self, + proposer_worker: MultiStepWorker, + scorer_worker: Worker, + rejection_sampler: RejectionSampler, + metrics_collector: Optional[AsyncMetricsCollector] = None, + ): + """ + Create a SpecDecodeWorker. + + Args: + proposer_worker: A worker that can produce speculative tokens for + sequences. 
+ scorer_worker: A worker that produces probabilities of speculative + tokens according to some base model. Typically a vanilla vLLM + Worker. + rejection_sampler: A Torch module used to perform modified rejection + sampling for speculative decoding. + metrics_collector: Helper class for collecting metrics; can be set + for testing purposes. + """ + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + self.rejection_sampler = rejection_sampler + + self._metrics = AsyncMetricsCollector( + rejection_sampler + ) if metrics_collector is None else metrics_collector + + self.probs_dtype = self.rejection_sampler.probs_dtype + self.token_id_dtype = self.rejection_sampler.token_id_dtype + + self.scorer: SpeculativeScorer = None + + def init_model(self) -> None: + """Initialize both scorer and proposer models. + """ + # The scorer worker model is initialized first in case the proposer + # model has a smaller TP degree than the target worker. + self.scorer_worker.init_model() + self.proposer_worker.init_model() + + self._metrics.init_gpu_tensors(self.rank) + self.rejection_sampler.init_gpu_tensors(self.rank) + self.scorer = BatchExpansionTop1Scorer( + scorer_worker=self.scorer_worker, + device=self.device, + vocab_size=self._vocab_size) + + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str) -> Tuple[int, int]: + """Determine the number of cache blocks to use. + + This is done by profiling the scorer model (which is typically the + larger of the two). Then the total memory which would be used by the + scorer cache is divided evenly between the proposer and scorer model KV, + such that the number of blocks is equal in both KV caches. + """ + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, + cache_dtype)) + + scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype) + proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype) + + new_num_gpu_blocks = split_num_cache_blocks_evenly( + scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, + num_gpu_blocks) + return new_num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig): + """Initialize the cache engine of the scorer and proposer workers. + """ + self.scorer_worker.init_cache_engine(cache_config) + self.proposer_worker.init_cache_engine(cache_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_spec_tokens: int, + ) -> List[SamplerOutput]: + """Perform speculative decoding on the input batch. + """ + + assert seq_group_metadata_list is not None, ( + "speculative decoding " + "requires non-None seq_group_metadata_list") + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. 
+ if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + return self._run_no_spec( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + return self._run_speculative_decoding_step( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + k=num_spec_tokens, + ) + + @nvtx_range("spec_decode_worker._run_no_spec") + def _run_no_spec( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + ) -> List[SamplerOutput]: + """Run a prefill step, without any speculation. The input is sent to the + proposer and scorer model so that the KV cache is consistent between the + two. + """ + + self.proposer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + sampler_output = self.scorer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.probs = None + sampler_output.sampled_tokens = None + return [sampler_output] + + @nvtx_range("spec_decode_worker._run_speculative_decoding_step") + def _run_speculative_decoding_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + ) -> List[SamplerOutput]: + """Execute a single step of speculative decoding. + + This invokes the proposer worker to get k speculative tokens for each + sequence, then scores each speculative token using the scoring worker. + + Returns a list of SamplerOutput, each containing a single token per + sequence. + """ + + # Generate proposals using draft worker. + proposals = self.proposer_worker.get_spec_proposals( + seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, k) + + proposal_scores = self.scorer.score_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + k, + proposals, + ) + + accepted_token_ids = self._verify_tokens(seq_group_metadata_list, + proposal_scores, proposals, k) + + return self._create_output_sampler_list(seq_group_metadata_list, + accepted_token_ids, k) + + @nvtx_range("spec_decode_worker._verify_tokens") + def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, + ) -> torch.Tensor: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. 
It can be + # done by supporting per-sequence proposal lens. + _, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + _, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + original_indices = spec_indices + non_spec_indices + + proposal_probs = proposal_scores.probs[spec_indices, :-1] + bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + accepted_token_ids = self.rejection_sampler( + proposal_probs, + bonus_token_ids, + proposals.proposal_probs, + proposals.proposal_token_ids, + ) + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + + 1).clone() + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat( + [accepted_token_ids, non_spec_token_ids]) + + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + return accepted_token_ids + + def _create_output_sampler_list( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] + k: int, + ) -> List[SamplerOutput]: + """Given the accepted token ids, create a list of SamplerOutput. + + The output is padded with -1 tokens such that each sequence has + the same number of outputs. + """ + seq_ids = get_all_seq_ids(seq_group_metadata_list) + + # shape: [k+1, batch_size] + accepted_token_ids_by_step = accepted_token_ids.transpose(0, + 1).tolist() + sampler_output_list = [] + for token_ids_by_step in accepted_token_ids_by_step: + if all(token_id == -1 for token_id in token_ids_by_step): + break + + step_output_token_ids = [] + for token_id, seq_id in zip(token_ids_by_step, seq_ids): + step_output_token_ids.append( + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq_id, + output_token=token_id, + # TODO Add verifier logprobs. + logprobs={token_id: 0.0}, + ) + ], + prompt_logprobs=None, + )) + sampler_output_list.append( + SamplerOutput(outputs=step_output_token_ids)) + + maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( + k) + if maybe_rejsample_metrics is not None: + sampler_output_list[ + 0].spec_decode_worker_metrics = maybe_rejsample_metrics + + return sampler_output_list + + @cached_property + def _vocab_size(self) -> int: + """Get the vocab size of the model and make sure it's consistent between + draft and target workers. + """ + vocab_sizes = [ + worker.vocab_size + for worker in [self.proposer_worker, self.scorer_worker] + ] + assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) + return vocab_sizes[0] + + @property + def rank(self): + return self.scorer_worker.rank + + @property + def device(self): + return self.scorer_worker.device + + +def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, + proposer_cache_block_size_bytes: int, + total_num_gpu_blocks: int) -> int: + """Given total_num_gpu_blocks, the number of GPU blocks that could be + allocate to the target model, this function calculates how many blocks + should be given to the draft and target model. + + Note that usually the block size, in bytes, of each model is different, + as it's a function of number of KV/layer, number of heads, and hidden + dimension size. 
+ + Since the target and draft models allocate the same number of blocks, we + simply calculate the number of blocks where if allocated by both models, + the total memory usage from KV cache is no larger than the number of + blocks allocatable by the target model alone. + """ + new_num_gpu_blocks = int( + total_num_gpu_blocks * scorer_cache_block_size_bytes / + (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) + + return new_num_gpu_blocks diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py new file mode 100644 index 0000000000000..2c5f954551905 --- /dev/null +++ b/vllm/spec_decode/util.py @@ -0,0 +1,99 @@ +import torch +from typing import List, Tuple +from vllm.sequence import SequenceGroupMetadata, SamplerOutput +from contextlib import contextmanager +from itertools import chain + +SeqId = int + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return list( + chain.from_iterable([ + seq_group_metadata.seq_data.keys() + for seq_group_metadata in seq_group_metadata_list + ])) + + +def split_batch_by_proposal_len( + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], select_proposal_len_zero: bool +) -> Tuple[List[SequenceGroupMetadata], List[int]]: + """Utility function that splits a batch based on whether the proposal len is + zero or not. We should remove this once vLLM supports per-sequence proposal + lens in a batch. + """ + + if select_proposal_len_zero: + predicate = lambda proposal_len: proposal_len == 0 + else: + predicate = lambda proposal_len: proposal_len != 0 + + indices = [ + i for i, (_, proposal_len + ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) + if predicate(proposal_len) + ] + seq_groups = [ + seq_group for seq_group, proposal_len in zip( + seq_group_metadata_list, proposal_lens) if predicate(proposal_len) + ] + + return seq_groups, indices + + +def sampler_output_to_torch( + sampler_output_list: List[SamplerOutput], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility function which converts a list of SamplerOutput to tensors. + + Returns: + sampled_token_ids: torch.Tensor + shape: [batch_size, len(sampler_output_list)] + + sampled_token_probs: torch.Tensor + shape: [batch_size, len(sampler_output_list), vocab_size] + """ + + # shape: [batch_size, num_sampler_output, vocab_size] + sampled_token_probs = torch.stack( + [ + sampler_output.sampled_token_probs + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + # shape: [batch_size, num_sampler_output] + sampled_token_ids = torch.stack( + [ + sampler_output.sampled_token_ids.flatten() + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + return sampled_token_ids, sampled_token_probs + + +@contextmanager +def nvtx_range(msg, *args, **kwargs): + """ + Context manager / decorator that pushes an NVTX range at the beginning + of its scope, and pops it at the end. If extra arguments are given, + they are passed as arguments to msg.format(). + + If running with cuda graphs, you must enable nsys cuda graph profiling. 
+ + Arguments: + msg (string): message to associate with the range + """ + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9023b0c59b3fb..0dd2309079403 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -97,8 +97,6 @@ def load_model(self) -> None: f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" ) - vocab_size = self.model.config.vocab_size - if self.lora_config: assert hasattr( self.model, "supported_lora_modules" @@ -111,7 +109,7 @@ def load_model(self) -> None: self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, + self.scheduler_config.max_paddings, self.vocab_size, self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -607,8 +605,7 @@ def execute_model( @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. - vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs @@ -774,6 +771,10 @@ def __del__(self) -> None: self.graph_runners.clear() self.cupy_nccl_backend = None + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CUDAGraphRunner: diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py deleted file mode 100644 index ab3e28389a04c..0000000000000 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import List, Dict -import copy - -import torch - -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.worker import Worker - - -class MultiStepWorker(Worker): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - @torch.inference_mode() - def execute_model_multi_step( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_steps: int, - ) -> List[SamplerOutput]: - """Run the model forward pass num_steps times. Returns the list of - sampler output, one per model forward pass. - """ - self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, - blocks_to_swap_out, blocks_to_copy) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - seq_group_metadata_list) - - # Assert enough KV space for num_steps tokens per sequence. 
- self._assert_enough_kv_space(seq_group_metadata_list, num_steps) - - # Run model num_steps times. - model_outputs = [] - for _ in range(num_steps): - model_output = super().execute_model( - seq_group_metadata_list=copied_seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob.logprob) - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - The multi-step worker must be able to append tokens to sequences after - a forward pass. This necessitates modification of the data structures - used by the worker. Since these data structures are shared with other - parts of vLLM, like the scheduler, we must take care not to introduce - unexpected side-effects. - - When Ray is used to orchestrate worker processes (such as when the - tensor-parallel degree is >1), this is not a problem because the input - datastructures will be serialized and created anew in the worker - process. - - However, when Ray is not used to orchestrate the worker processes (such - as when the tensor-parallel degree is 1), this is a problem. We avoid - the problem by shallow-copying the input datastructures (specifically, - the parts that will change in multiple steps). - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. 
- """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 157e8c45836b1..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -130,8 +130,8 @@ def profile_num_available_blocks( # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -232,6 +232,22 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. 
+ """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + def init_distributed_environment( parallel_config: ParallelConfig, From 0bba88df03754c40bd9135fc2ff9554ffca59c87 Mon Sep 17 00:00:00 2001 From: Terry <149540247+tterrysun@users.noreply.github.com> Date: Sat, 9 Mar 2024 17:14:16 -0800 Subject: [PATCH 076/113] Enhance lora tests with more layer and rank variations (#3243) --- csrc/punica/bgmv/bgmv_config.h | 1 + requirements-dev.txt | 1 + tests/lora/test_layer_variation.py | 104 +++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 tests/lora/test_layer_variation.py diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 3eb84ceb4d534..4dc90de1ab42a 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128) \ f(in_T, out_T, W_T, narrow, 256) \ f(in_T, out_T, W_T, narrow, 512) \ + f(in_T, out_T, W_T, narrow, 768) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1728) \ diff --git a/requirements-dev.txt b/requirements-dev.txt index dfcbfa4253f1c..5502c97d014ac 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,6 +21,7 @@ einops # required for MPT openai requests ray +peft # Benchmarking aiohttp diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py new file mode 100644 index 0000000000000..1a1da517b2276 --- /dev/null +++ b/tests/lora/test_layer_variation.py @@ -0,0 +1,104 @@ +from typing import List, Optional +import peft +import pytest +from random import sample +import tempfile +from transformers import AutoModelForCausalLM + +import vllm +from vllm.lora.request import LoRARequest +from .conftest import cleanup + +MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" +PROMPTS = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", +] + + +def get_lora_model(model_id: str, target_modules: List[str], rank: int): + model = AutoModelForCausalLM.from_pretrained(model_id) + lora_config = peft.tuners.lora.LoraConfig(target_modules, rank) + lora_model = peft.PeftModel(model, lora_config) + return lora_model + + +def do_sample(llm, + lora_path: Optional[str] = None, + lora_id: Optional[int] = None, + logprobs: int = 0, + n_tokens: int = 256): + prompts = PROMPTS + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=n_tokens, + logprobs=logprobs, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts = [] + generated_logprobs = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + generated_logprobs.append([ + list(logprob.keys()) for out in output.outputs + for logprob in out.logprobs + ]) + return generated_logprobs if logprobs else generated_texts + + +SUPPORTED_MODULES = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" +] +TARGET_MODULES_LIST = [] +for length in range(2, 6): + TARGET_MODULES_LIST.extend( + [sample(SUPPORTED_MODULES, length) for _ in range(3)]) + + +# Test the correctness when layer and rank are varied +# step 1: init a base model and serve with LoRA to get the reference results +# step 2: merge the same LoRA to the base model, serve the merged model +# step 3: compare the results from step 1 and step 2 +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) +@pytest.mark.parametrize("rank", [8, 16, 32, 64]) +def test_layer_variation_correctness(tp_size, target_modules, rank): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32) + del llm + cleanup() + reference_id_sets = [set(prob[0]) for prob in merged_probs] + + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + merged_model = model.merge_and_unload() + merged_model.save_pretrained(tmpdir) + llm = vllm.LLM(tmpdir, + tokenizer=MODEL_PATH, + enable_lora=False, + max_num_seqs=16, + tensor_parallel_size=tp_size, + worker_use_ray=True) + probs = do_sample(llm, logprobs=5, n_tokens=32) + del llm + cleanup() + # verify the top-5 tokens are identical for each token + id_sets = [set(prob[0]) for prob in probs] + assert id_sets == reference_id_sets From e4a28e53165902ffc5daf20977c70885d0c05768 Mon Sep 17 00:00:00 2001 From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com> Date: Sun, 10 Mar 2024 17:27:45 -0500 Subject: [PATCH 077/113] [ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUDA (#3262) --- csrc/attention/attention_kernels.cu | 8 -------- csrc/cuda_compat.h | 10 ++++++++++ csrc/reduction_utils.cuh | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..5e61668d5cc1a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,9 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_ROCM -#include -#endif #include #include @@ -31,11 +28,6 @@ #include -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index aa58dd73c148a..c711d8d1b24b9 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,5 +1,15 @@ #pragma once +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + #ifndef USE_ROCM #define VLLM_LDG(arg) __ldg(arg) #else diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index b95ccef162072..210bf0b023ab2 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -24,7 +24,7 @@ namespace vllm { template __inline__ __device__ T warpReduceSum(T val) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } @@ -32,7 +32,7 @@ __inline__ __device__ T warpReduceSum(T val) { /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[32]; + static __shared__ T shared[WARP_SIZE]; int lane = threadIdx.x & 0x1f; int wid = threadIdx.x >> 5; @@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) { // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); + val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f); val = warpReduceSum(val); return val; } From 9e8744a545f34ca636a5113ae98cec5851af56eb Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 11 Mar 2024 10:17:16 +0800 Subject: [PATCH 078/113] [BugFix] Fix get tokenizer when using ray (#3301) --- tests/async_engine/test_async_llm_engine.py | 3 +++ vllm/engine/async_llm_engine.py | 9 +++++++-- vllm/engine/llm_engine.py | 8 +++++++- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/transformers_utils/tokenizer.py | 6 ++++-- 6 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1e31ff7373031..cb125a7bfec30 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -89,3 +89,6 @@ async def test_new_requests_event(): await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 assert engine.engine.step_calls == old_step_calls + 1 + + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) + assert engine.get_tokenizer() is not None diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 65ab0c0634176..5629d1a863d04 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -5,6 +5,8 @@ from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, AsyncIterator, Callable) +from transformers import PreTrainedTokenizer + from vllm.lora.request import LoRARequest from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -372,8 +374,11 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) - def get_tokenizer(self): - return self.engine.tokenizer.tokenizer + async def get_tokenizer(self) -> "PreTrainedTokenizer": + if self.engine_use_ray: + return await self.engine.get_tokenizer.remote() + else: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" diff 
--git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8484014c9a13f..5b46d9db5649a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,6 +7,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +from transformers import PreTrainedTokenizer + import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, @@ -163,7 +165,11 @@ def __reduce__(self): # the closure used to initialize Ray worker actors raise RuntimeError("LLMEngine should not be pickled!") - def get_tokenizer_for_seq(self, sequence: Sequence): + def get_tokenizer(self) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer() + + def get_tokenizer_for_seq(self, + sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) def _dispatch_worker(self): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ba352f18f6454..7d5603c85e4e9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -65,7 +65,7 @@ async def create_chat_completion( lora_request = self._maybe_get_lora(request) guided_decode_logits_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8244fd150753..c673b2582c47b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -126,7 +126,7 @@ async def create_completion(self, request: CompletionRequest, lora_request = self._maybe_get_lora(request) guided_decode_logit_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6edc225cdfc80..2600ea2642da2 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -120,7 +120,8 @@ async def encode_async( def get_lora_tokenizer( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: @@ -133,7 +134,8 @@ def get_lora_tokenizer( async def get_lora_tokenizer_async( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: From 4b59f00e917679337169c88c981f268e6ab96cd6 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 10 Mar 2024 19:17:46 -0700 Subject: [PATCH 079/113] [Fix] Fix best_of behavior when n=1 (#3298) --- vllm/outputs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 4f9eddee11cd4..b8173fd7a0638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -87,12 +87,12 @@ def __init__( 
@classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if n == 1: + if len(seqs) == 1: top_n_seqs = seqs else: + # Get the top-n sequences. + n = seq_group.sampling_params.n if seq_group.sampling_params.use_beam_search: sorting_key = lambda seq: seq.get_beam_search_score( seq_group.sampling_params.length_penalty) From 2f8844ba08d77af8a64784317055b03a475f6051 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 10 Mar 2024 19:49:14 -0700 Subject: [PATCH 080/113] Re-enable the 80 char line width limit (#3305) --- pyproject.toml | 6 +- setup.py | 4 +- tests/async_engine/test_chat_template.py | 6 +- tests/core/test_block_manager.py | 3 +- tests/entrypoints/test_guided_processors.py | 4 +- tests/entrypoints/test_openai_server.py | 36 +++--- tests/kernels/test_moe.py | 3 +- tests/kernels/test_prefix_prefill.py | 3 +- tests/lora/test_layer_variation.py | 6 +- tests/lora/test_layers.py | 15 ++- tests/lora/test_llama.py | 47 ++++---- tests/lora/test_mixtral.py | 12 +- tests/metrics/test_metrics.py | 14 ++- tests/models/test_marlin.py | 15 +-- tests/prefix_caching/test_prefix_caching.py | 15 ++- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_sampler.py | 17 +-- tests/spec_decode/test_metrics.py | 6 +- tests/spec_decode/test_multi_step_worker.py | 3 +- tests/spec_decode/test_spec_decode_worker.py | 18 ++- vllm/config.py | 14 ++- vllm/core/block_manager.py | 15 ++- vllm/core/evictor.py | 6 +- vllm/core/scheduler.py | 8 +- vllm/engine/llm_engine.py | 27 +++-- vllm/engine/metrics.py | 22 ++-- vllm/entrypoints/api_server.py | 8 +- vllm/entrypoints/openai/api_server.py | 33 +++--- vllm/entrypoints/openai/serving_chat.py | 25 ++-- vllm/entrypoints/openai/serving_completion.py | 28 +++-- vllm/entrypoints/openai/serving_engine.py | 13 ++- vllm/lora/layers.py | 14 ++- vllm/lora/models.py | 3 +- vllm/lora/worker_manager.py | 7 +- vllm/model_executor/guided_decoding.py | 6 +- .../guided_logits_processors.py | 15 ++- .../layers/attention/attention.py | 4 +- .../layers/fused_moe/fused_moe.py | 107 ++++++++++++------ vllm/model_executor/layers/linear.py | 12 +- .../layers/quantization/__init__.py | 3 +- .../model_executor/layers/quantization/awq.py | 6 +- .../layers/quantization/gptq.py | 10 +- .../layers/quantization/marlin.py | 39 ++++--- .../layers/quantization/squeezellm.py | 3 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/models/baichuan.py | 3 +- vllm/model_executor/models/deepseek.py | 8 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/olmo.py | 19 ++-- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/models/stablelm.py | 13 ++- vllm/model_executor/models/starcoder2.py | 3 +- vllm/model_executor/neuron_model_loader.py | 3 +- .../parallel_utils/communication_op.py | 5 +- vllm/model_executor/sampling_metadata.py | 3 +- vllm/sampling_params.py | 4 +- vllm/sequence.py | 3 +- vllm/spec_decode/batch_expansion.py | 29 +++-- vllm/spec_decode/multi_step_worker.py | 14 ++- vllm/spec_decode/spec_decode_worker.py | 19 ++-- vllm/transformers_utils/configs/mpt.py | 89 +++------------ vllm/transformers_utils/configs/starcoder2.py | 72 ------------ .../transformers_utils/tokenizers/baichuan.py | 92 +++++++-------- vllm/utils.py | 12 +- vllm/worker/model_runner.py | 11 +- vllm/worker/neuron_worker.py | 6 +- 67 files changed, 557 insertions(+), 528 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index c5db016cebdb7..d6fa5d7a035ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 + [tool.ruff.lint] select = [ # pycodestyle @@ -29,8 +33,6 @@ ignore = [ "F405", "F403", # lambda expression assignment "E731", - # line too long, handled by black formatting - "E501", # .strip() with multi-character strings "B005", # Loop control variable not used within loop body diff --git a/setup.py b/setup.py index 745b5a9b2d02a..023c3cde1910c 100644 --- a/setup.py +++ b/setup.py @@ -142,8 +142,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = (subprocess.check_output( + [command]).decode('utf-8').strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 32d110e0f0b47..e98bba8d43b49 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -73,7 +73,7 @@ def test_load_chat_template(): assert template_content is not None # Hard coded value for template_chatml.jinja assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 def test_no_load_chat_template(): @@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt) # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}") diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 04d01f7724e4f..b280fd1d73c2f 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,7 +4,8 @@ from vllm import SamplingParams from vllm.block import PhysicalTokenBlock -from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus +from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager, + AllocStatus) from vllm.utils import Device from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5b39269916f8b..4a0e3e759e25a 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -46,8 +46,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") def 
test_guided_logits_processors(): diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index f4a6e44d88a87..a5b2bf4c0f0c9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -5,9 +5,12 @@ import sys import pytest import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray import openai # use the official client for correctness check -from huggingface_hub import snapshot_download # downloading lora to test lora requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download # imports for guided decoding tests import json @@ -17,8 +20,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here -LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" TEST_SCHEMA = { "type": "object", @@ -59,8 +65,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") TEST_CHOICE = [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", @@ -120,8 +126,9 @@ def server(zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, + # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", # use half precision for speed and memory savings in CI environment + "bfloat16", "--max-model-len", "8192", "--enforce-eager", @@ -392,7 +399,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
use_beam_search=True), ) assert len(batch.choices) == 4 @@ -469,8 +477,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_guided_json_completion(server, client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, - prompt= - f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", n=3, temperature=1.0, max_tokens=500, @@ -489,9 +497,11 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI): "role": "system", "content": "you are a helpful assistant" }, { - "role": "user", - "content": "Give an example JSON for an employee profile that " + \ - f"fits this schema: {TEST_SCHEMA}" + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" }] chat_completion = await client.chat.completions.create( model=MODEL_NAME, diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c402fe3e98c7f..6165225d2d819 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -57,7 +57,8 @@ def test_fused_moe( [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): - "Make sure our Mixtral MoE implementation agrees with the one from huggingface." + """Make sure our Mixtral MoE implementation agrees with the one from + huggingface.""" # Instantiate our and huggingface's MoE blocks config = MixtralConfig() diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index e881cd1ec3753..a0be658acac7b 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -114,7 +114,8 @@ def test_contexted_kv_attention( v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() - # Warm up the Triton kernel by calling it once before actually measuring generation time + # Warm up the Triton kernel by calling it once before actually measuring + # generation time context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, b_start_loc, b_seq_len, b_ctx_len, max_input_len) torch.cuda.synchronize() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 1a1da517b2276..95cf0cede8729 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -11,9 +11,9 @@ MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. 
[/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. 
[/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501 ] diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 18ce300449dbf..46f054c5b84ef 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -17,14 +17,16 @@ LoRAMapping, BaseLayerWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights +from vllm.lora.models import (LoRALayerWeights, convert_mapping, + PackedLoRALayerWeights) from vllm.config import LoRAConfig from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear, QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -258,7 +260,8 @@ def create_random_embedding_layer(): @torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: @@ -674,9 +677,9 @@ class FakeConfig: result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index dfaf8c700695a..130906c3d584d 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -10,12 +10,12 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the 
elector what is under nationality? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size): tensor_parallel_size=tp_size) expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] print("lora adapter created") @@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files): def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" @ray.remote(num_gpus=1) def get_num_gpu_blocks_lora(): @@ -132,13 +133,15 @@ def get_num_gpu_blocks_lora(): @ray.remote(num_gpus=1) def get_num_gpu_blocks_no_lora(): llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) return num_gpu_blocks_no_lora_warmup num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) num_gpu_blocks_no_lora_warmup = ray.get( get_num_gpu_blocks_no_lora.remote()) assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " "less 
when using lora than when not using lora") diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index e45fb92ab7edf..4d74722aaa926 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -9,9 +9,9 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. 
[/user] [assistant]", # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -42,9 +42,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): worker_use_ray=True) expected_lora_output = [ - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] assert do_sample(llm, mixtral_lora_files, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 410bdfa5c69e2..0ab9c63ce4377 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( "prompts of different lengths are required") @@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" - ) + f"prompt token count: {vllm_prompt_token_count!r}\n" + f"metric: {metric_count!r}") @pytest.mark.parametrize("model", MODELS) @@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" - ) + f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index f3cc517364f06..a3a1487e62e05 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -1,7 +1,7 @@ """Compare the outputs of a GPTQ model to a Marlin model. -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the +Note: GPTQ and Marlin do not have bitwise correctness. 
+As a result, in this test, we just confirm that the top selected tokens of the Marlin/GPTQ models are in the top 3 selections of each other. Note: Marlin internally uses locks to synchronize the threads. This can @@ -14,7 +14,8 @@ import pytest import torch from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] @@ -87,11 +88,11 @@ def test_models( if marlin_output_id != gptq_output_id: # Each predicted token must be in top 5 of the other's assert gptq_output_id in marlin_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") assert marlin_output_id in gptq_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") # Break out since sequences will now diverge. break diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7ef8dde7bb8f6..c83551c36ef10 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -20,20 +20,23 @@ def test_block_allocator( num_blocks, enable_caching=True) - # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + # Allocate two PysicalTokenBlocks with the same hash and check + # that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) - # Free the first_block and confirm that the ref_count is correctly decremented on the second block + # Free the first_block and confirm that the ref_count is correctly + # decremented on the second block block_allocator.free(first_block) assert (second_block.ref_count == 1) # Free the second block block_allocator.free(second_block) - # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + # Reallocate the first block and confirm that, even after the block + # had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ): for block in blocks: block_allocator.free(block) - # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + # Allocate a new block and confirm that it's the first block freed. 
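The prefix-caching tests above spell out the allocator contract: allocating the same content hash twice returns the same physical block with an incremented ref count, a block whose ref count reaches zero stays cached until space is needed, and eviction reclaims the least recently freed block. A rough, self-contained sketch of that contract is below; the `ToyCachedAllocator` and `Block` names are made up for illustration and are not vLLM's actual allocator or evictor classes.

```python
from collections import OrderedDict
from dataclasses import dataclass
from typing import Dict


@dataclass
class Block:
    block_hash: int
    ref_count: int = 0


class ToyCachedAllocator:
    """Same hash -> same block; freed blocks wait in an LRU list before eviction."""

    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.cached: Dict[int, Block] = {}
        # Freed-but-not-evicted blocks, oldest first (LRU eviction order).
        self.free_list: "OrderedDict[int, Block]" = OrderedDict()

    def allocate(self, block_hash: int) -> Block:
        if block_hash in self.free_list:
            # Bring back a freed-but-not-evicted block (its ref_count was 0).
            block = self.free_list.pop(block_hash)
        elif block_hash in self.cached:
            block = self.cached[block_hash]
        elif len(self.cached) >= self.num_blocks:
            # Evict the least recently freed block and reuse it under a new
            # hash (assumes at least one block has been freed).
            _, block = self.free_list.popitem(last=False)
            del self.cached[block.block_hash]
            block.block_hash = block_hash
        else:
            block = Block(block_hash)
        self.cached[block_hash] = block
        block.ref_count += 1
        return block

    def free(self, block: Block) -> None:
        block.ref_count -= 1
        if block.ref_count == 0:
            self.free_list[block.block_hash] = block


# Mirrors the behavior the test above checks.
alloc = ToyCachedAllocator(num_blocks=16)
a = alloc.allocate(block_hash=123)
b = alloc.allocate(block_hash=123)
assert a is b and b.ref_count == 2
alloc.free(a)
alloc.free(b)
assert alloc.allocate(123) is a  # still cached after ref_count hit zero
```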
+ # I.E The Least Recently Used block new_block_hash = block_size new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) @@ -68,7 +72,8 @@ def test_eviction(num_blocks: int, ): assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) - # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + # Allocate a new block and confirm that it's not the realloc_block, + # since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1abb55f021214..14f1872c45258 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -70,8 +70,8 @@ def test_get_prompt_logprobs( hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) - assert isinstance(sample_logprob.decoded_token, str), \ - ("The token should be decoded by the time it is returned " + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is returned " " to the user.") diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 31e865f42ff3b..1bc8703d1a8e0 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -255,9 +255,10 @@ def test_sampling(model_runner: ModelRunner): if metadata.sampling_params.use_beam_search: continue - if metadata.sampling_params.seed is not None \ - and expected_tokens[i] is None: - # Record seeded random result to compare with results of second invocation + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation expected_tokens[i] = [ nth_output.output_token for nth_output in sequence_output.samples @@ -265,11 +266,13 @@ def test_sampling(model_runner: ModelRunner): continue for n, nth_output in enumerate(sequence_output.samples): - if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed assert nth_output.output_token == expected_tokens[i][n] else: - # For non-seeded random check that one of the high-logit tokens were chosen + # For non-seeded random check that one of the high-logit + # tokens were chosen assert nth_output.output_token in expected_tokens[i] # Test batch @@ -284,8 +287,8 @@ def test_sampling(model_runner: ModelRunner): input_tensor.data = input_tensor.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index) - # This time, results of seeded random samples will be compared with the corresponding - # sample in the pre-shuffled batch + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch test_sampling(model_runner) del model_runner diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 941ea37aa81e0..09847136d13e9 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -150,8 +150,10 @@ def test_initial_metrics_has_correct_values(has_data: bool): assert metrics.emitted_tokens == num_emitted_tokens if has_data: - assert metrics.draft_acceptance_rate == num_accepted_tokens / 
num_draft_tokens - assert metrics.system_efficiency == num_emitted_tokens / num_possible_tokens + assert (metrics.draft_acceptance_rate == num_accepted_tokens / + num_draft_tokens) + assert (metrics.system_efficiency == num_emitted_tokens / + num_possible_tokens) else: assert math.isnan(metrics.draft_acceptance_rate) assert math.isnan(metrics.system_efficiency) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 88bb7c293fe95..45b43ec59ee8f 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,7 +3,8 @@ import pytest from unittest.mock import MagicMock -from vllm.spec_decode.multi_step_worker import MultiStepWorker, DraftModelTop1Proposer +from vllm.spec_decode.multi_step_worker import (MultiStepWorker, + DraftModelTop1Proposer) from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplerOutput diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e919711c3ed2c..bfc69e01e3eb9 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -4,12 +4,15 @@ from unittest.mock import MagicMock from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker, split_num_cache_blocks_evenly +from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, + split_num_cache_blocks_evenly) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.model_executor.utils import set_random_seed from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from .utils import mock_worker, create_batch, ExecuteModelData, create_sampler_output_list -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics, AsyncMetricsCollector +from .utils import (mock_worker, create_batch, ExecuteModelData, + create_sampler_output_list) +from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, + AsyncMetricsCollector) @pytest.mark.parametrize('k', [1, 2, 6]) @@ -391,13 +394,15 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): mock_rejsample_metrics = MagicMock( spec=SpecDecodeWorkerMetrics) if returns_metrics else None - metrics_collector.maybe_collect_rejsample_metrics.return_value = mock_rejsample_metrics + metrics_collector.maybe_collect_rejsample_metrics.return_value = ( + mock_rejsample_metrics) output = worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics - call_args_list = metrics_collector.maybe_collect_rejsample_metrics.call_args_list + call_args_list = ( + metrics_collector.maybe_collect_rejsample_metrics.call_args_list) assert len(call_args_list) == 1 args, kwargs = call_args_list[0] assert args[0] == k or kwargs.get('k', -1) == k @@ -547,7 +552,8 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, target_worker.profile_num_available_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) - target_worker.get_cache_block_size_bytes.return_value = target_cache_block_size_bytes + target_worker.get_cache_block_size_bytes.return_value = ( + target_cache_block_size_bytes) draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, diff --git a/vllm/config.py b/vllm/config.py index 
ef9a920f29c2a..e893fe702c975 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -189,8 +189,8 @@ def _verify_quantization(self) -> None: if is_hip( ) and self.quantization in rocm_not_supported_quantization: raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization != "marlin": logger.warning( f"{self.quantization} quantization is not fully " @@ -321,7 +321,8 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: @@ -399,8 +400,9 @@ def __init__( ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): - # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. - # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # For Neuron device support, here we assign TP=1 to avoid sharding + # within vLLM directly. Transformer-neuronx would take + # neuron_tp_degree attribute, and distribute the workload # to multiple NeuronCores. self.tensor_parallel_size = 1 self.neuron_tp_degree = tensor_parallel_size diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 52b120f227eda..8bfc14999f0a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -95,13 +95,15 @@ def free(self, block: PhysicalTokenBlock) -> None: del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + return (self.num_blocks - self.current_num_blocks + + self.evictor.num_blocks) def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - # If caching is enabled, update the hash of block and the cached_blocks dictionary. + # If caching is enabled, update the hash of block and the + # cached_blocks dictionary. 
if self.enable_caching: assert not self.contains_block(block_hash) old_hash = block.block_hash @@ -218,10 +220,12 @@ def _promote_last_block( seq: Sequence, last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: - # Compute a new hash for the block so that it can be shared by other Sequences + # Compute a new hash for the block so that it can be shared by + # other Sequences new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) - # if new_hash is already in the cached table, then free last_block and return the cached version + # if new_hash is already in the cached table, then free last_block + # and return the cached version if self.gpu_allocator.contains_block(new_hash): self.gpu_allocator.free(last_block) return self.gpu_allocator.allocate(new_hash) @@ -289,7 +293,8 @@ def append_slot( assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared + # If the last block is now complete, promote it to a full block so + # that it can be shared new_block = self._maybe_promote_last_block(seq, last_block) block_table[-1] = new_block return None diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index b538ea574b604..1d81f5a97d71c 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -39,9 +39,9 @@ def add(self, block: PhysicalTokenBlock): @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is contained - in the evictor before calling remove. Should be used to "bring back" blocks - that have been freed but not evicted yet. + evictor. Caller is responsible for making sure that block_hash is + contained in the evictor before calling remove. Should be used to + "bring back" blocks that have been freed but not evicted yet. """ pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c96c6d62ef19d..9255f91be55cb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -214,8 +214,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_waiting_sequences.appendleft(seq_group) @@ -309,8 +309,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. 
leftover_swapped.appendleft(seq_group) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5b46d9db5649a..6e045cd6d73c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -100,7 +100,8 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"disable_custom_all_reduce=" + f"{parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " @@ -929,7 +930,8 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.metrics.last_token_time) + # Time since last token. + # (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. if seq_group.is_finished(): @@ -961,16 +963,17 @@ def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, for token_id, sample_logprob in logprobs.items(): if (sample_logprob.decoded_token is None and token_id != -1): all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] - _, new_text, prefix_offset, read_offset = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) + (_, new_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. + spaces_between_special_tokens, + ) sample_logprob.decoded_token = new_text def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d31542159e4a4..17b1852f5b0a3 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics +from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, + disable_created_metrics) import time import numpy as np @@ -177,10 +178,12 @@ def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on the vLLM side. - # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. - # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + # Support legacy gauge metrics that make throughput calculations on + # the vLLM side. Moving forward, we should use counters like + # counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the + # grafana/prometheus side. 
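The comment above distinguishes the legacy gauges, where vLLM computes throughput itself, from plain counters whose rate is derived later with PromQL's `rate()`. A minimal sketch of the counter style with `prometheus_client` follows; the metric and label names are invented for the example and do not match vLLM's real metric names.

```python
from prometheus_client import Counter

# Export only the raw token count; a dashboard derives throughput with
# e.g. rate(example_prompt_tokens_total[1m]) instead of trusting a
# client-side tokens/s gauge.
counter_prompt_tokens = Counter(
    "example_prompt_tokens_total",
    "Cumulative number of prefill tokens processed.",
    labelnames=["model_name"],
)


def record_iteration(model_name: str, num_prompt_tokens: int) -> None:
    counter_prompt_tokens.labels(model_name=model_name).inc(num_prompt_tokens)
```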
See + # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 self.metrics.gauge_avg_prompt_throughput.labels( **self.labels).set(prompt_throughput) self.metrics.gauge_avg_generation_throughput.labels( @@ -188,7 +191,7 @@ def _log_prometheus_interval(self, prompt_throughput: float, def log(self, stats: Stats) -> None: """Called by LLMEngine. - Logs to prometheus and tracked stats every iteration. + Logs to prometheus and tracked stats every iteration. Logs to Stdout every self.local_interval seconds.""" # Log to prometheus. @@ -200,8 +203,8 @@ def log(self, stats: Stats) -> None: # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): - - # Compute summary metrics for tracked stats (and log them to promethus if applicable). + # Compute summary metrics for tracked stats (and log them + # to promethus if applicable). prompt_throughput = self._get_throughput(self.num_prompt_tokens, now=stats.now) generation_throughput = self._get_throughput( @@ -213,7 +216,8 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Avg generation throughput: " + f"{generation_throughput:.1f} tokens/s, " f"Running: {stats.num_running} reqs, " f"Swapped: {stats.num_swapped} reqs, " f"Pending: {stats.num_waiting} reqs, " diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1eb4ab8b06b64..86b6c4c67cfa4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ """ -NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. -It is not intended for production use. For production use, we recommend using our OpenAI compatible server. -We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. """ import argparse diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9f29b4ac92f48..00407bc0e809c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -18,7 +18,9 @@ import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse) from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -84,13 +86,11 @@ def parse_args(): type=json.loads, default=["*"], help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." 
- ) + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") parser.add_argument("--served-model-name", type=str, default=None, @@ -103,9 +103,8 @@ def parse_args(): default=None, nargs='+', action=LoRAParserAction, - help= - "LoRA module configurations in the format name=path. Multiple modules can be specified." - ) + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") parser.add_argument("--chat-template", type=str, default=None, @@ -138,9 +137,10 @@ def parse_args(): help="Additional ASGI middleware to apply to the app. " "We accept multiple --middleware arguments. " "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). " - ) + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). ") parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() @@ -235,9 +235,8 @@ async def authentication(request: Request, call_next): elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." - ) + raise ValueError(f"Invalid middleware {middleware}. " + f"Must be a function or a class.") logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7d5603c85e4e9..d2fb9ca001b15 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,7 +12,8 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -37,8 +38,9 @@ async def create_chat_completion( ChatCompletionResponse]: """Completion API similar to OpenAI's API. - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + ChatCompletion API. NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) @@ -116,7 +118,8 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). 
if first_iteration: - # Send first response for each request.n (index) with the role + # Send first response for each request.n (index) with + # the role role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -133,7 +136,8 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" - # Send response to echo the input portion of the last message + # Send response to echo the input portion of the + # last message if request.echo: last_msg_content = "" if request.messages and isinstance( @@ -145,11 +149,12 @@ async def chat_completion_stream_generator( if last_msg_content: for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage( - content=last_msg_content), - finish_reason=None) + choice_data = ( + ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c673b2582c47b..b78f053800f3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,8 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional, + Dict, Tuple) from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -16,7 +17,8 @@ ) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -44,9 +46,8 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: prompt_is_tokens = True prompts = prompt # case 4: array of token arrays else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" - ) + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") return prompt_is_tokens, prompts @@ -156,7 +157,8 @@ async def create_completion(self, request: CompletionRequest, int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. + # results. In addition, we do not stream the results when use + # beam search. stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) @@ -223,7 +225,8 @@ async def completion_stream_generator( for output in res.outputs: i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. + # TODO(simon): optimize the performance by avoiding full + # text O(n^2) sending. 
if request.echo and request.max_tokens == 0: # only return the prompt @@ -231,11 +234,12 @@ async def completion_stream_generator( delta_token_ids = res.prompt_token_ids top_logprobs = res.prompt_logprobs has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[ - i]: + elif (request.echo and request.max_tokens > 0 + and not has_echoed[i]): # echo the prompt and first token delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids + delta_token_ids = (res.prompt_token_ids + + output.token_ids) top_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True @@ -248,7 +252,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 230d13d97dbba..2db884945c491 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -50,10 +50,12 @@ def __init__(self, except RuntimeError: event_loop = None - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop + if event_loop is not None and event_loop.is_running(): + # If the current is instanced by Ray Serve, + # there is already a running event loop event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray + else: + # When using single vLLM without engine_use_ray asyncio.run(self._post_init()) async def _post_init(self): @@ -178,8 +180,9 @@ def _validate_prompt_and_tokenize( if token_num + request.max_tokens > self.max_model_len: raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{request.max_tokens + token_num} tokens " f"({token_num} in the messages, " f"{request.max_tokens} in the completion). 
" f"Please reduce the length of the messages or completion.", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e667d70f71e39..99e6cdeee6364 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -20,10 +20,12 @@ RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + split_tensor_along_last_dim) if TYPE_CHECKING: pass @@ -84,7 +86,8 @@ def _apply_lora_packed_nslice( lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices """ org_output = output x = x.view(-1, x.shape[-1]) @@ -819,9 +822,8 @@ def create_lora_weights( ) -> None: # Keep this in sync with csrc/punica/bgmv/bgmv_config.h if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) + raise ValueError("When using LoRA, vocab size must be " + "32000 >= vocab_size <= 33024") self.lora_a_stacked = torch.zeros( ( max_loras, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7386d21c58e4e..238da256b7cdc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,8 @@ from vllm.config import LoRAConfig from vllm.utils import LRUCache, in_wsl -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer, + from_layer_sampler) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7e92bc93ab472..911115d63a639 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,10 +154,9 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: f"LoRA rank {lora.rank} is greater than max_lora_rank " f"{self.lora_config.max_lora_rank}.") if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) + raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}.") return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py index a8573f8bdc6c8..00984460d79a6 100644 --- a/vllm/model_executor/guided_decoding.py +++ b/vllm/model_executor/guided_decoding.py @@ -8,8 +8,10 @@ from typing import Union, Tuple from pydantic import BaseModel -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest -from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) class GuidedDecodingMode(Enum): diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py index 1b3e5e71a5911..76d41aa37dd7b 100644 --- a/vllm/model_executor/guided_logits_processors.py +++ b/vllm/model_executor/guided_logits_processors.py @@ -107,12 +107,15 @@ def __init__(self, Parameters ---------- schema - A JSON schema that encodes the structure we want the model to generate + A JSON schema that encodes the structure we want the model to + generate tokenizer The model's tokenizer whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + Pattern to use for JSON syntactic whitespace (doesn't impact + string literals) + Example: allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) @@ -122,8 +125,8 @@ def __init__(self, schema_str = schema else: raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " + - "a Pydantic object, a dictionary or a string that contains the JSON " - + "Schema specification") + f"Cannot parse schema {schema}. 
The schema must be either " + f"a Pydantic object, a dictionary or a string that contains " + f"the JSON Schema specification") regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string, tokenizer) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 724dd0511c5aa..4b63b9eaf59a7 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -35,12 +35,12 @@ def __init__( ) -> None: super().__init__() if _use_flash_attn(): - from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend + from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 08e3c2d5b706e..3e6dd0dfe2eb3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,9 +30,10 @@ def fused_moe_kernel( K, EM, num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). stride_am, stride_ak, stride_be, @@ -50,17 +51,30 @@ def fused_moe_kernel( compute_type: tl.constexpr, ): """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. - This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. 
+ - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. @@ -105,7 +119,8 @@ def fused_moe_kernel( accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. + # Load the next block of A and B, generate a mask by checking the + # K dimension. a = tl.load(a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), @@ -139,30 +154,41 @@ def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. + This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. 
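The `moe_align_block_size` docstring above, together with the worked example that follows it, can be rehearsed in plain Python. The sketch below reproduces the padding-and-sorting idea on the documented toy input; `toy_moe_align` is an illustrative helper, not the actual kernel-backed implementation, and it only visits experts that appear in the input.

```python
from typing import List, Tuple


def toy_moe_align(topk_ids: List[List[int]],
                  block_size: int) -> Tuple[List[int], List[int], int]:
    """Pure-Python rehearsal of the alignment described above (no Triton)."""
    flat = [e for row in topk_ids for e in row]
    pad_id = len(flat)  # sentinel index for padding slots
    sorted_token_ids: List[int] = []
    expert_ids: List[int] = []
    for expert in sorted(set(flat)):
        token_positions = [i for i, e in enumerate(flat) if e == expert]
        # Pad this expert's token list up to a multiple of block_size.
        while len(token_positions) % block_size != 0:
            token_positions.append(pad_id)
        sorted_token_ids.extend(token_positions)
        expert_ids.extend([expert] * (len(token_positions) // block_size))
    return sorted_token_ids, expert_ids, len(sorted_token_ids)


ids, experts, total = toy_moe_align(
    [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size=4)
assert ids == [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]
assert total == 16
```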
Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. - As block_size is 4, we pad 1 token for each expert. - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. """ sorted_ids = torch.empty( (topk_ids.numel() + num_experts * (block_size - 1), ), @@ -224,13 +250,14 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. - The return value will be a dictionary that maps an irregular grid of batch sizes - to configurations of the fused_moe kernel. To evaluate the kernel on a given batch - size bs, the closest batch size in the grid should be picked and the associated - configuration chosen to invoke the kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. """ - # First look up if an optimized configuration is available in the configs directory + # First look up if an optimized configuration is available in the configs + # directory device_name = torch.cuda.get_device_name().replace(" ", "_") config_file_path = os.path.join( @@ -243,7 +270,8 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} - # If no optimized configuration is available, we will use the default configuration + # If no optimized configuration is available, we will use the default + # configuration return None @@ -258,18 +286,22 @@ def fused_moe( override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + Parameters: - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). 
- topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ @@ -325,7 +357,8 @@ def fused_moe( configs = get_moe_configs(E, w2.shape[2]) if configs: - # If an optimal configuration map has been found, look up the optimal config + # If an optimal configuration map has been found, look up the + # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b2396a1d6f141..60f6fc83b200f 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -285,7 +285,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -307,7 +308,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -413,7 +415,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -442,7 +445,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. 
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dc54641878c64..af27b1844cea4 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,6 +1,7 @@ from typing import Type -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..2caef5f1ebf50 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class AWQConfig(QuantizationConfig): @@ -50,7 +51,8 @@ def get_min_capability(self) -> int: def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", ] @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 2e6aabb232673..bb69c7235a133 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -31,8 +31,8 @@ def __init__( self.pack_factor = Fraction(32, self.weight_bits) if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 2/3/4/8-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " @@ -101,7 +101,8 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + if (output_size_per_partition % self.quant_config.pack_factor.numerator + != 0): raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -114,7 +115,8 @@ def create_weights( exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: + if (input_size != input_size_per_partition + and self.quant_config.group_size != -1): # For act-order models, we cannot use Exllama for row parallel layer if self.quant_config.desc_act: exllama_state = ExllamaState.UNUSED diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 7566d78a8aba4..0c4f20d9e3a58 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -5,7 +5,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class MarlinConfig(QuantizationConfig): @@ -22,8 +23,9 @@ def __init__( self.group_size = group_size if self.group_size != 128 and self.group_size != -1: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) is supported for " - f"Marlin, but got group_size of {self.group_size}") + "Currently, only group size 128 and -1 (channelwise) " + "is supported for Marlin, but got group_size of " + f"{self.group_size}") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // 4 @@ -37,7 +39,8 @@ def __init__( # Min in_features dim self.min_k_threads = 128 - # Max parallel problems to solve at once (improves large batch performance) + # Max parallel problems to solve at once (improves large + # batch performance) self.max_parallel = 16 # Permutation length used by the marlin kernels. @@ -102,22 +105,26 @@ def create_weights( # Validate output_size_per_partition if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}.") if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}.") # Validate input_size_per_partition if input_size_per_partition % self.quant_config.min_k_threads != 0: raise ValueError( - f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." - ) - if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." 
- ) + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}.") + if (self.quant_config.group_size != -1 and + input_size_per_partition % self.quant_config.group_size != 0): + raise ValueError(f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}.") # Check that we have at least 4 tiles horizontally in the shard num_tiles_per_perm = self.quant_config.perm_len // ( @@ -149,7 +156,9 @@ def create_weights( ) # Determine if channelwise or not - input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + input_groups = (1 if self.quant_config.group_size == -1 else + input_size_per_partition // + self.quant_config.group_size) scales = Parameter( torch.empty( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..ed25455e6ec1f 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.utils import is_hip diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 19e7f630c4620..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -6,7 +6,8 @@ from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 6da0082b94285..cbf472750e294 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -333,7 +333,8 @@ def load_weights(self, if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: + # Unlike Baichuan, Baichuan2 normalizes the head weights. + # Refer to: # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 # Distinguish between Baichuan and Baichuan2 by checking the # vocab size. 
This is suggested by diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index f2dca3df27cfb..13c080cb02774 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -119,7 +119,8 @@ def __init__( linear_method=None) if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) self.shared_experts = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -273,8 +274,9 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekMoE(config=config, linear_method=linear_method) else: self.mlp = DeepseekMLP( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index b8c6822e9825e..93dce7b67a7a5 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -143,7 +143,8 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + inner_dim = (4 * config.n_embd + if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 0ae0a85643456..7b2215ef4bda5 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -305,7 +305,8 @@ def load_weights(self, param = params_dict[name] if "wqkv" in name: config = self.config - kv_groups = config.num_attention_heads // config.num_key_value_heads + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) head_dim = config.hidden_size // config.num_attention_heads loaded_weight = loaded_weight.view(-1, 2 + kv_groups, head_dim, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index fa7a6d850051e..2b0a420e82faf 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -52,7 +52,8 @@ ) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,7 +82,8 @@ def output_multiplier(self) -> float: class OlmoAttention(nn.Module): """ - This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
""" @@ -94,11 +96,12 @@ def __init__( self.config = config self.hidden_size = config.d_model assert config.d_model % config.n_heads == 0 - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) self.total_num_heads = self.config.n_heads assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) self.head_dim = self.hidden_size // self.total_num_heads # Layer norms. @@ -158,7 +161,8 @@ def forward( class OlmoMLP(nn.Module): """ - This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the MLP block where the output is computed as + ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ @@ -217,7 +221,8 @@ def forward( class OlmoBlock(nn.Module): """ - This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 4dd63f923e5f2..3e4f843e649b4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -170,7 +170,8 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + use_sliding_window = (config.use_sliding_window + and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index d1a547f815616..c66f327beee7a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,5 +1,6 @@ # coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. +# All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +17,8 @@ # This code is based off the following work: # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" +"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) +model compatible with HuggingFace weights.""" from typing import List, Optional, Tuple import torch @@ -102,9 +104,9 @@ def __init__(self, self.kv_size = self.num_key_value_heads * self.head_dim self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + raise ValueError(f"hidden_size must be divisible by num_heads " + f"(got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") self.qkv_proj = QKVParallelLinear(self.hidden_size, self.head_dim, @@ -192,7 +194,6 @@ def __init__(self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None) -> None: super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index efa235233372f..cfbb1bdb7909e 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -35,7 +35,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py index b8d63d4ff12fc..c434b270a5562 100644 --- a/vllm/model_executor/neuron_model_loader.py +++ b/vllm/model_executor/neuron_model_loader.py @@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: - from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + from transformers_neuronx.config import (NeuronConfig, + ContinuousBatchingConfig) parallel_config = kwargs.get("parallel_config") scheduler_config = kwargs.get("scheduler_config") diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index cf805df892fdc..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -11,7 +11,8 @@ get_tensor_model_parallel_group, is_cupy_nccl_enabled_for_all_reduce, ) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce +from vllm.model_executor.parallel_utils.custom_all_reduce import ( + custom_all_reduce) def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: @@ -24,7 +25,7 @@ def 
tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: and GPU topology. TLDR: always assume this function modifies its input, but use the return - value as the output. + value as the output. """ # Bypass the function if we are using only 1 GPU. if get_tensor_model_parallel_world_size() == 1: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7deb80801856e..b23f0170a6ca5 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -114,7 +114,8 @@ def from_sampling_metadata( do_penalties = True if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs + # For tokens in the prompt that we only need to get + # their logprobs prompt_len = sampling_metadata.prompt_lens[i] temperatures += [temperature] * (prompt_len - 1) top_ps += [top_p] * (prompt_len - 1) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24bf..4aa158878fb96 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -74,8 +74,8 @@ class SamplingParams: stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - include_stop_str_in_output: Whether to include the stop strings in output - text. Defaults to False. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. max_tokens: Maximum number of tokens to generate per output sequence. diff --git a/vllm/sequence.py b/vllm/sequence.py index 37c102407a5f2..4a002edaf580f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -351,7 +351,8 @@ def maybe_set_first_token_time(self, time: float) -> None: self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request level timings.""" + """Sets the first scheduled time and time in queue for Request + level timings.""" if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 478c950f52873..0f698fa346010 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -5,8 +5,12 @@ from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) from vllm.worker.worker import Worker -from vllm.spec_decode.util import nvtx_range, sampler_output_to_torch, get_all_seq_ids, split_batch_by_proposal_len -from vllm.spec_decode.interfaces import SpeculativeScorer, SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, + get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import (SpeculativeScorer, + SpeculativeProposals, + SpeculativeScores) SeqId = int TargetSeqId = int @@ -68,11 +72,12 @@ def score_proposals( proposal_lens_list = proposals.proposal_lens.tolist() proposal_token_ids_list = proposals.proposal_token_ids.tolist() - spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens = self._expand_batch( - seq_group_metadata_list=seq_group_metadata_list, - 
proposal_token_ids_list=proposal_token_ids_list, - proposal_lens_list=proposal_lens_list, - ) + (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) target_sampler_output = self._scorer_worker.execute_model( seq_group_metadata_list=target_seq_group_metadata_list, @@ -125,7 +130,8 @@ def _expand_batch( num_scoring_tokens = len(target_seq_group_metadata_list) target_seq_group_metadata_list.extend(non_spec_seqs) - return spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens + return (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) def _contract_batch(self, original_bs: int, target_sampler_output: List[SamplerOutput], @@ -306,10 +312,11 @@ def _split_scoring_output( # Convert non-speculative output tokens to tensors. sampler_output.sampled_token_probs = non_spec_probs sampler_output.sampled_token_ids = non_spec_sampled_tokens - non_spec_target_token_ids, non_spec_target_probs = sampler_output_to_torch( - [sampler_output]) + non_spec_target_token_ids, non_spec_target_probs = ( + sampler_output_to_torch([sampler_output])) - return target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs + return (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) def _create_target_seq_id_iterator( self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f7be14d3d22c2..0915c275b0408 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -5,7 +5,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.worker import Worker -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.util import sampler_output_to_torch @@ -247,8 +248,9 @@ def get_proposals( """ # Split speculative- and non-speculative- sequences. 
- proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices = self._split_by_max_model_len( - seq_group_metadata_list, max_proposal_len) + (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) if nonzero_proposal_len_seqs: # Speculate tokens using the draft worker for the speculative @@ -306,7 +308,8 @@ def _split_by_max_model_len( else: proposal_lens.append(0) - return proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices + return (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) def _merge_outputs( self, @@ -356,7 +359,8 @@ def _merge_outputs( device=self._device) entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs - proposal_tokens, proposal_probs = entire_proposal_tokens, entire_proposal_probs + proposal_tokens, proposal_probs = (entire_proposal_tokens, + entire_proposal_probs) proposal_lens = torch.zeros(batch_size, dtype=torch.long, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 890e479202372..1e56741347008 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -10,7 +10,8 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.config import CacheConfig -from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids, + split_batch_by_proposal_len) from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import SpeculativeScorer @@ -25,7 +26,7 @@ class SpecDecodeWorker: LLM, after which some verification routine determines which (if any) of the speculative tokens are accepted by the larger LLM. - See https://github.com/vllm-project/vllm/pull/2188 and + See https://github.com/vllm-project/vllm/pull/2188 and https://github.com/vllm-project/vllm/pull/3103 for more info. 
The current implementation has the following limitations: @@ -109,10 +110,12 @@ def profile_num_available_blocks(self, block_size: int, block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype)) - scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( - block_size, cache_dtype) - proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( - block_size, cache_dtype) + scorer_cache_block_size_bytes = ( + self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + proposer_cache_block_size_bytes = ( + self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) new_num_gpu_blocks = split_num_cache_blocks_evenly( scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, @@ -320,8 +323,8 @@ def _create_output_sampler_list( sampler_output_list.append( SamplerOutput(outputs=step_output_token_ids)) - maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( - k) + maybe_rejsample_metrics = ( + self._metrics.maybe_collect_rejsample_metrics(k)) if maybe_rejsample_metrics is not None: sampler_output_list[ 0].spec_decode_worker_metrics = maybe_rejsample_metrics diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 5ea0d9122ef11..2c0e45623aa25 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,62 +62,6 @@ def __init__(self, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. 
- alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ self.d_model = d_model self.n_heads = n_heads self.n_layers = n_layers @@ -139,8 +83,8 @@ def __init__(self, self.fc_type = fc_type if verbose is not None: warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - ), + 'verbose argument for MPTConfig is now ignored and ' + 'will be removed. 
Use python_log_level instead.'), stacklevel=2) if 'name' in kwargs: del kwargs['name'] @@ -149,7 +93,8 @@ def __init__(self, if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + f'alibi is turned on, setting `learned_pos_emb` ' + f'to {self.learned_pos_emb}`', stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -176,8 +121,8 @@ def _validate_config(self) -> None: [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] )): raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " + "probabilities and must be between 0 and 1") if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: raise ValueError( f"Unknown attn_impl={self.attn_config['attn_impl']}") @@ -193,17 +138,17 @@ def _validate_config(self) -> None: if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ 'attn_impl'] not in ['torch', 'triton']: raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) + 'attn_uses_sequence_id only implemented with torch ' + 'and triton attention.') if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long - ) + 'model.embedding_fraction must be between 0 (exclusive) ' + 'and 1 (inclusive)!') if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) + f"self.logit_scale={self.logit_scale!r} is not recognized as " + "an option; use numeric value or 'inv_sqrt_d_model'.") if self.init_config.get('name', None) is None: raise ValueError( f"self.init_config={self.init_config!r} 'name' needs to be set." @@ -219,11 +164,11 @@ def _validate_config(self) -> None: del te except Exception as exc: raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + + 'TransformerEngine import fail. `fc_type: te` requires ' + 'TransformerEngine be installed. ' + 'The required version of transformer_engine also requires ' + 'FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' ) from exc if self.ffn_config['ffn_type'] == 'mptmlp': diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py index 4c3b6b8def074..2879cd0445275 100644 --- a/vllm/transformers_utils/configs/starcoder2.py +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -2,78 +2,6 @@ class Starcoder2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a - Starcoder2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration - with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. - - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 49152): - Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Starcoder2Model`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 12288): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 30): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 24): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 2): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_epsilon (`float`, *optional*, defaults to 1e-05): - Epsilon value for the layer norm - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - bos_token_id (`int`, *optional*, defaults to 50256): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 50256): - The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `None` (no sliding window). - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - residual_dropout (`float`, *optional*, defaults to 0.0): - Residual connection dropout value. - embedding_dropout (`float`, *optional*, defaults to 0.0): - Embedding dropout. - use_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias term on linear layers of the model. 
- - - ```python - >>> from transformers import Starcoder2Model, Starcoder2Config - - >>> # Initializing a Starcoder2 7B style configuration - >>> configuration = Starcoder2Config() - - >>> # Initializing a model from the Starcoder2 7B style configuration - >>> model = Starcoder2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "starcoder2" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py index 1dd241e4a5c4b..02045bdcb2ccf 100644 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ b/vllm/transformers_utils/tokenizers/baichuan.py @@ -1,4 +1,3 @@ -# yapf: disable # Adapted from # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # This includes a fix suggested in @@ -13,7 +12,6 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} @@ -52,27 +50,16 @@ def __init__( clean_up_tokenization_spaces=False, **kwargs, ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) + self.sp_model_kwargs = ({} if sp_model_kwargs is None else + sp_model_kwargs) + bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) else bos_token) + eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) else eos_token) + unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) else unk_token) + pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) else pad_token) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token @@ -107,7 +94,10 @@ def vocab_size(self): def get_vocab(self): """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } vocab.update(self.added_tokens_encoder) return vocab @@ -130,7 +120,8 @@ def convert_tokens_to_string(self, tokens): out_string = "" prev_is_special = False for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model + # make sure that special tokens are not decoded using + # sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " @@ -143,9 +134,9 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, + save_directory, + filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. 
@@ -157,24 +148,24 @@ def save_vocabulary( `Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) " + "should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): + out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file,) + return (out_vocab_file, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] @@ -194,7 +185,8 @@ def get_special_tokens_mask( already_has_special_tokens: bool = False, ) -> List[int]: """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + Retrieve sequence ids from a token list that has no special tokens + added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: @@ -202,11 +194,14 @@ def get_special_tokens_mask( List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + already_has_special_tokens (`bool`, *optional*, defaults to + `False`): + Whether or not the token list is already formatted with + special tokens for the model. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + `List[int]`: A list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( @@ -220,20 +215,16 @@ def get_special_tokens_mask( if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + Creates a mask from the two sequences passed to be used in a + sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` @@ -250,7 +241,8 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of [token type IDs](../glossary#token-type-ids) + according to the given sequence(s). 
""" bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/vllm/utils.py b/vllm/utils.py index 5b94067cec777..fe6fd27962cd3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. from vllm._C import cuda_utils - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + max_shared_mem = ( + cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) @@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + logger.info(f'CUDA_HOME is not found in the environment. ' + f'Using {cuda_home} as CUDA_HOME.') else: logger.warning( f'Not found nvcc in {cuda_home}. Skip cuda version check!') diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0dd2309079403..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -93,14 +93,13 @@ def load_model(self) -> None: scheduler_config=self.scheduler_config) self.model_memory_usage = m.consumed_memory - logger.info( - f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" - ) + logger.info(f"Loading model weights took " + f"{self.model_memory_usage / float(2**30):.4f} GB") if self.lora_config: - assert hasattr( - self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") assert hasattr( self.model, "embedding_modules"), "Model does not have embedding_modules" diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3229a21c11a38..340c079600c78 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -79,7 +79,8 @@ def profile_num_available_blocks( cpu_swap_space: int = 0, cache_dtype: str = "float16", ) -> Tuple[int, int]: - """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + """Simply returns max_num_seqs as num_gpu_blocks, 0 as + num_cpu_blocks.""" num_gpu_blocks = self.scheduler_config.max_num_seqs num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks @@ -177,7 +178,8 @@ def _init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - distributed_backend = distributed_backend if distributed_backend else "nccl" + distributed_backend = (distributed_backend + if distributed_backend else "nccl") torch.distributed.init_process_group( backend=distributed_backend, world_size=parallel_config.world_size, From 657061fdced8a33a60c1b09f5da2525de9da8f03 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 11 Mar 2024 00:54:51 -0700 Subject: [PATCH 081/113] [docs] Add LoRA support information for models (#3299) --- docs/source/models/lora.rst | 3 ++- docs/source/models/supported_models.rst | 27 ++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) 
diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 21b18c75fc552..f05fafe9f8279 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -92,7 +92,8 @@ LoRA adapter requests if they were provided and ``max_loras`` is set high enough The following is an example request -.. code-block::bash +.. code-block:: bash + curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9d4ec663a16e5..4019e0bbd90fb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -8,84 +8,109 @@ The following is the list of model architectures that are currently supported by Alongside each architecture, we include some popular models that use it. .. list-table:: - :widths: 25 25 50 + :widths: 25 25 50 5 :header-rows: 1 * - Architecture - Models - Example HuggingFace Models + - :ref:`LoRA ` * - :code:`AquilaForCausalLM` - Aquila - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. + - ✅︎ * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MixtralForCausalLM` - Mixtral-8x7B, Mixtral-8x7B-Instruct - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. 
+ - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - ✅︎ * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. From 4c922709b65ff5c0652ae36b93047016bdeaace8 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 11 Mar 2024 11:03:45 -0700 Subject: [PATCH 082/113] Add distributed model executor abstraction (#3191) --- docs/source/dev/engine/llm_engine.rst | 2 +- format.sh | 8 +- tests/lora/conftest.py | 3 +- vllm/__init__.py | 4 +- vllm/config.py | 7 +- vllm/engine/async_llm_engine.py | 106 +++--- vllm/engine/llm_engine.py | 446 +++----------------------- vllm/engine/ray_utils.py | 58 ++-- vllm/executor/__init__.py | 0 vllm/executor/executor_base.py | 75 +++++ vllm/executor/gpu_executor.py | 163 ++++++++++ vllm/executor/ray_gpu_executor.py | 442 +++++++++++++++++++++++++ vllm/executor/utils.py | 13 + 13 files changed, 818 insertions(+), 509 deletions(-) create mode 100644 vllm/executor/__init__.py create mode 100644 vllm/executor/executor_base.py create mode 100644 vllm/executor/gpu_executor.py create mode 100644 vllm/executor/ray_gpu_executor.py create mode 100644 vllm/executor/utils.py diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst index b550a9b5faa62..1de6d7adc87c6 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.rst @@ -2,5 +2,5 @@ LLMEngine ================================= .. autoclass:: vllm.engine.llm_engine.LLMEngine - :members: add_request, abort_request, step, _init_cache + :members: add_request, abort_request, step :show-inheritance: \ No newline at end of file diff --git a/format.sh b/format.sh index eb2c5ab031626..ff30111123bee 100755 --- a/format.sh +++ b/format.sh @@ -95,13 +95,17 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +CODESPELL_EXCLUDES=( + '--skip' '*docs/source/_build/**' +) + # check spelling of specified files spell_check() { codespell "$@" } spell_check_all(){ - codespell --toml pyproject.toml + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" } # Spelling check of files that differ from main branch. @@ -116,7 +120,7 @@ spell_check_changed() { if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell + codespell "${CODESPELL_EXCLUDES[@]}" fi } diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 67273144ecd02..30a8ad03c8ada 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,4 +152,5 @@ def get_model_patched(model_config, device_config, **kwargs): @pytest.fixture def llama_2_7b_model_extra_embeddings( llama_2_7b_engine_extra_embeddings) -> nn.Module: - yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model + yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/__init__.py b/vllm/__init__.py index f1e30f5eb6e6e..5e40c3c20fcd2 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -3,7 +3,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.entrypoints.llm import LLM from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams @@ -19,5 +19,5 @@ "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", - "initialize_cluster", + "initialize_ray_cluster", ] diff --git a/vllm/config.py b/vllm/config.py index e893fe702c975..d2b68b6fa1fe2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, ClassVar +from typing import TYPE_CHECKING, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version @@ -10,6 +10,9 @@ from vllm.transformers_utils.config import get_config from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + logger = init_logger(__name__) _GB = 1 << 30 @@ -397,6 +400,7 @@ def __init__( max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, + placement_group: Optional["PlacementGroup"] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): @@ -412,6 +416,7 @@ def __init__( self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce self.ray_workers_use_nsight = ray_workers_use_nsight + self.placement_group = placement_group self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5629d1a863d04..0cee604c14d45 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,8 +2,8 @@ import os import time from functools import partial -from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator, Callable) +from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, + Union, AsyncIterator) from transformers import PreTrainedTokenizer @@ -11,7 +11,7 @@ from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -208,17 +208,10 @@ async def step_async(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): # Execute the model. - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] + output = await self.model_executor.execute_model_async( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -268,37 +261,8 @@ async def add_request_async( lora_request=lora_request, ) - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = getattr(self.driver_worker, method) - coros.append(asyncio.get_event_loop().run_in_executor( - None, partial(driver_executor, *driver_args, **driver_kwargs))) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs - - async def check_health_async(self): - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() + async def check_health_async(self) -> None: + self.model_executor.check_health() class AsyncLLMEngine: @@ -353,6 +317,34 @@ def __init__(self, self._request_tracker: Optional[RequestTracker] = None self._errored_with: Optional[BaseException] = None + @classmethod + def from_engine_args(cls, + engine_args: AsyncEngineArgs, + start_engine_loop: bool = True) -> "AsyncLLMEngine": + """Creates an async LLM engine from the engine arguments.""" + # Create the engine configs. 
+ engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync + executor_class = RayGPUExecutorAsync + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutorAsync + executor_class = GPUExecutorAsync + # Create the async LLM engine. + engine = cls(parallel_config.worker_use_ray, + engine_args.engine_use_ray, + *engine_configs, + executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + max_log_len=engine_args.max_log_len, + start_engine_loop=start_engine_loop) + return engine + @property def is_running(self) -> bool: return (self.background_loop is not None @@ -670,35 +662,13 @@ async def get_model_config(self) -> ModelConfig: else: return self.engine.get_model_config() - @classmethod - def from_engine_args(cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config, - engine_args.engine_use_ray) - # Create the async LLM engine. - engine = cls(parallel_config.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - placement_group, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=start_engine_loop) - return engine - async def do_log_stats(self) -> None: if self.engine_use_ray: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() - async def check_health(self): + async def check_health(self) -> None: """Raises an error if engine is unhealthy.""" t = time.perf_counter() logger.debug("Starting health check...") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6e045cd6d73c6..4cdad4180aa14 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,11 +1,5 @@ -import copy -from collections import defaultdict -import os import time -import pickle -import importlib -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Union) +from typing import Dict, Iterable, List, Optional, Tuple, Type, Union from transformers import PreTrainedTokenizer @@ -15,8 +9,9 @@ ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs +from vllm.executor.executor_base import ExecutorBase from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -24,29 +19,11 @@ SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) -from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, - get_open_port, get_distributed_init_method) - -if ray: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - 
from ray.util.placement_group import PlacementGroup +from vllm.utils import Counter logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 -# A map between the device type (in device config) to its worker module. -DEVICE_TO_WORKER_MODULE_MAP = { - "cuda": "vllm.worker.worker", - "neuron": "vllm.worker.neuron_worker", -} - -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. -USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) - class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -71,8 +48,8 @@ class LLMEngine: parallel_config: The configuration related to distributed execution. scheduler_config: The configuration related to the request scheduler. device_config: The configuration related to the device. - placement_group: Ray placement group for distributed execution. - Required for distributed execution. + executor_class: The model executor class for managing distributed + execution. log_stats: Whether to log statistics. """ @@ -84,7 +61,7 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - placement_group: Optional["PlacementGroup"], + executor_class: Type[ExecutorBase], log_stats: bool, ) -> None: logger.info( @@ -121,33 +98,13 @@ def __init__( self._init_tokenizer() self.seq_counter = Counter() - # Create the parallel GPU workers. - if self.parallel_config.worker_use_ray: - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - # Pass additional arguments to initialize the worker - additional_ray_args = {} - if self.parallel_config.ray_workers_use_nsight: - logger.info("Configuring Ray workers to use nsight.") - additional_ray_args = { - "runtime_env": { - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - } - } - self._init_workers_ray(placement_group, **additional_ray_args) - else: - self._init_workers() - - # Profile the memory usage and initialize the cache. - self._init_cache() + self.model_executor = executor_class(model_config, cache_config, + parallel_config, scheduler_config, + device_config, lora_config) # Create the scheduler. + # NOTE: the cache_config here have been updated with the numbers of + # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) # Metric Logging. @@ -157,9 +114,29 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag() + @classmethod + def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + + # Initialize the cluster and specify the executor class. 
+ if parallel_config.worker_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutor + executor_class = RayGPUExecutor + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutor + executor_class = GPUExecutor + + # Create the LLM engine. + engine = cls(*engine_configs, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats) + return engine def __reduce__(self): # This is to ensure that the LLMEngine is not referenced in @@ -173,39 +150,6 @@ def get_tokenizer_for_seq(self, sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker - - def _init_workers(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - - self.workers: List[Worker] = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self._run_workers("init_model") - self._run_workers("load_model") - def _init_tokenizer(self, **tokenizer_init_kwargs): init_kwargs = dict( enable_lora=bool(self.lora_config), @@ -218,126 +162,6 @@ def _init_tokenizer(self, **tokenizer_init_kwargs): self.tokenizer: TokenizerGroup = TokenizerGroup( self.model_config.tokenizer, **init_kwargs) - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - num_gpus = self.cache_config.gpu_memory_utilization - else: - num_gpus = 1 - - self.driver_dummy_worker: RayWorkerVllm = None - self.workers: List[RayWorkerVllm] = [] - - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - else: - self.workers.append(worker) - - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. 
Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) - - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # Set CUDA_VISIBLE_DEVICES for the driver. - set_cuda_visible_devices(node_gpus[driver_node_id]) - for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): - worker.set_cuda_visible_devices.remote(node_gpus[node_id]) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - # Initialize torch distributed process group for the workers. - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - for rank, (worker, (node_id, - _)) in enumerate(zip(self.workers, - worker_node_and_gpu_ids), - start=1): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) - - # don't use cupy for eager mode - self._run_workers("init_model", - cupy_port=get_open_port() - if not model_config.enforce_eager else None) - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) self.cache_config.verify_with_parallel_config(self.parallel_config) @@ -346,81 +170,6 @@ def _verify_args(self) -> None: self.lora_config.verify_with_scheduler_config( self.scheduler_config) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. 
- - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameters. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - # FIXME(woosuk): Change to debug log. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = self.cache_config.block_size * num_gpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - @classmethod - def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config) - # Create the LLM engine. - engine = cls(*engine_configs, - placement_group, - log_stats=not engine_args.disable_log_stats) - return engine - def encode_request( self, request_id: str, # pylint: disable=unused-argument @@ -826,7 +575,7 @@ def step(self) -> List[RequestOutput]: - A Sequence Group (SG) refer to a group of sequences that are generated from the same prompt. - - Step 2: Calls the workers to execute the model. + - Step 2: Calls the distributed executor to execute the model. - Step 3: Processes the model output. This mainly includes: - Decodes the relevant outputs. @@ -862,19 +611,10 @@ def step(self) -> List[RequestOutput]: seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) - - # Only the driver worker returns the sampling results. 
- output = all_outputs[0] + output = self.model_executor.execute_model( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -1043,111 +783,13 @@ def _finalize_sequence(self, seq: Sequence, seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) + return self.model_executor.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) + return self.model_executor.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - output_channels = self.forward_dag.execute(1) - else: - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - - # Get the results of the ray workers. - if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. - for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs - - def _compiled_ray_dag(self): - import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version - if current_version < required_version: - raise ValueError(f"Ray version {required_version} or greater is " - f"required, but found {current_version}") - - from ray.dag import MultiOutputNode, InputNode - assert self.parallel_config.worker_use_ray - - # Right now, compiled DAG requires at least 1 arg. We send - # a dummy value for now. It will be fixed soon. 
- with InputNode() as input_data: - forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers - ]) - return forward_dag.experimental_compile() + return self.model_executor.list_loras() def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.parallel_config.worker_use_ray: - return - - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") + self.model_executor.check_health() diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index bbcbbdfea2f00..742f3dc575190 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,6 +1,6 @@ import pickle -from typing import Optional, List, Tuple, TYPE_CHECKING +from typing import Optional, List, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -65,45 +65,38 @@ def execute_model_compiled_dag_remote(self, ignored): ray = None RayWorkerVllm = None -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -def initialize_cluster( +def initialize_ray_cluster( parallel_config: ParallelConfig, - engine_use_ray: bool = False, ray_address: Optional[str] = None, -) -> Optional["PlacementGroup"]: - """Initialize the distributed cluster probably with Ray. +): + """Initialize the distributed cluster with Ray. + + it will connect to the Ray cluster and create a placement group + for the workers, which includes the specification of the resources + for each distributed worker. Args: parallel_config: The configurations for parallel execution. - engine_use_ray: Whether to use Ray for async engine. ray_address: The address of the Ray cluster. If None, uses the default Ray cluster address. - - Returns: - An optional `PlacementGroup`. It includes the specification - of the resources for each distributed worker. None if Ray is - not used. """ - if parallel_config.worker_use_ray or engine_use_ray: - if ray is None: - raise ImportError( - "Ray is not installed. Please install Ray to use distributed " - "serving.") - # Connect to a ray cluster. - if is_hip(): - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None + if ray is None: + raise ImportError( + "Ray is not installed. Please install Ray to use distributed " + "serving.") + + # Connect to a ray cluster. + if is_hip(): + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) + + if parallel_config.placement_group: + # Placement group is already set. + return # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() @@ -138,4 +131,5 @@ def initialize_cluster( # if they cannot be provisioned. 
ray.get(current_placement_group.ready(), timeout=1800) - return current_placement_group + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group diff --git a/vllm/executor/__init__.py b/vllm/executor/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py new file mode 100644 index 0000000000000..30717e8a87358 --- /dev/null +++ b/vllm/executor/executor_base.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class ExecutorBase(ABC): + """Base class for all executors. + + An executor is responsible for executing the model on a specific device + type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor + that can execute the model on multiple devices. + """ + + @abstractmethod + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError + + +class ExecutorAsyncBase(ExecutorBase): + + @abstractmethod + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + async def check_health_async(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py new file mode 100644 index 0000000000000..9019ee7763c77 --- /dev/null +++ b/vllm/executor/gpu_executor.py @@ -0,0 +1,163 @@ +import importlib +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. 
+DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class GPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_model() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self.driver_worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." 
+ return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return + + +class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py new file mode 100644 index 0000000000000..261fcfb7dad9b --- /dev/null +++ b/vllm/executor/ray_gpu_executor.py @@ -0,0 +1,442 @@ +import asyncio +import copy +from collections import defaultdict +import os +import pickle +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port, + get_distributed_init_method, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. 
+ self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # Set CUDA_VISIBLE_DEVICES for the driver and workers. 
+ set_cuda_visible_devices(node_gpus[driver_node_id]) + for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): + worker.set_cuda_visible_devices.remote(node_gpus[node_id]) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + # FIXME(woosuk): We are not properly initializing cupy NCCL when + # we have multiple nodes. + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. 
+ num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. 
+ for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. + for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py new file mode 100644 index 0000000000000..44976696a77c6 --- /dev/null +++ b/vllm/executor/utils.py @@ -0,0 +1,13 @@ +def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. 
" + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From c9415c19d3df26d8ede611abefba35c6837cd934 Mon Sep 17 00:00:00 2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Tue, 12 Mar 2024 04:14:07 +0800 Subject: [PATCH 083/113] [ROCm] Fix warp and lane calculation in blockReduceSum (#3321) --- csrc/reduction_utils.cuh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 210bf0b023ab2..c25464e866e55 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -29,12 +29,22 @@ __inline__ __device__ T warpReduceSum(T val) { return val; } +__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) { + return warp_size - 1; +} + +__inline__ __device__ constexpr int _calculateWidShift(int warp_size) { + return 5 + (warp_size >> 6); +} + /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { static __shared__ T shared[WARP_SIZE]; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; + constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE); + constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE); + int lane = threadIdx.x & LANE_MASK; + int wid = threadIdx.x >> WID_SHIFT; val = warpReduceSum(val); From 654865e21df8ac6fe95de926625306e5756c2c0d Mon Sep 17 00:00:00 2001 From: DAIZHENWEI <32122197+DAIZHENWEI@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:19:51 -0700 Subject: [PATCH 084/113] Support Mistral Model Inference with transformers-neuronx (#3153) --- examples/offline_inference_neuron.py | 10 ++- vllm/model_executor/models/__init__.py | 7 +- vllm/model_executor/models/neuron/mistral.py | 82 ++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) mode change 100644 => 100755 examples/offline_inference_neuron.py mode change 100644 => 100755 vllm/model_executor/models/__init__.py create mode 100755 vllm/model_executor/models/neuron/mistral.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py old mode 100644 new mode 100755 index 9b9dc4d94892f..da8874abd92a2 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -14,14 +14,16 @@ llm = LLM( model="openlm-research/open_llama_3b", max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as max sequence length, - # when targeting neuron device. Currently, this is a known limitation in continuous batching - # support in transformers-neuronx. + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx. max_model_len=128, block_size=128, # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, or explicitly assigned. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. 
device="neuron") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py old mode 100644 new mode 100755 index 75c2ae1e9f48e..bc3b6a582d53d --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -62,8 +62,11 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } -# Models not supported by Neuron. -_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} +# Models supported by Neuron. +_NEURON_SUPPORTED_MODELS = { + "LlamaForCausalLM": "neuron.llama", + "MistralForCausalLM": "neuron.mistral" +} class ModelRegistry: diff --git a/vllm/model_executor/models/neuron/mistral.py b/vllm/model_executor/models/neuron/mistral.py new file mode 100755 index 0000000000000..a302cce30abab --- /dev/null +++ b/vllm/model_executor/models/neuron/mistral.py @@ -0,0 +1,82 @@ +"""Inference-only Mistral model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MistralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput +import os + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MistralForCausalLM(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.lm_head = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> SamplerOutput: + with torch.inference_mode(): + seq_ids = [] + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.mistral.model import MistralForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers import MistralForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = MistralForCausalLM.from_pretrained( + model_name_or_path, low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = MistralForSampling.from_pretrained( + split_model_dir, **kwargs) + self.model.to_neuron() From b0925b38789bb3b20dcc39e229fcfe12a311e487 Mon Sep 17 00:00:00 2001 From: Sherlock Xu <65327072+Sherlock113@users.noreply.github.com> Date: Wed, 13 Mar 2024 01:34:30 +0800 Subject: [PATCH 
085/113] docs: Add BentoML deployment doc (#3336) Signed-off-by: Sherlock113 --- docs/source/index.rst | 1 + docs/source/serving/deploying_with_bentoml.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/serving/deploying_with_bentoml.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index c0250bf99f7ae..65bfbbabf8be1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,6 +73,7 @@ Documentation serving/run_on_sky serving/deploying_with_kserve serving/deploying_with_triton + serving/deploying_with_bentoml serving/deploying_with_docker serving/serving_with_langchain serving/metrics diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst new file mode 100644 index 0000000000000..4b9d19f5bdb72 --- /dev/null +++ b/docs/source/serving/deploying_with_bentoml.rst @@ -0,0 +1,8 @@ +.. _deploying_with_bentoml: + +Deploying with BentoML +====================== + +`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. + +For details, see the tutorial `vLLM inference in the BentoML documentation `_. \ No newline at end of file From 49a3c8662ba745503890ab8b3c502aad7e1a0a19 Mon Sep 17 00:00:00 2001 From: Breno Faria Date: Wed, 13 Mar 2024 01:30:08 +0100 Subject: [PATCH 086/113] Fixes #1556 double free (#3347) --- tests/core/test_block_manager.py | 87 ++++++++++++++++++++++++++++++++ vllm/core/block_manager.py | 17 ++++++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index b280fd1d73c2f..44ac05a1430b3 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -274,3 +274,90 @@ def test_reset(): # Resetting block manager frees all allocated blocks. block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + + +def test_sliding_window_multi_seq(): + """ + Tests that memory allocation and deallocation is handled + correctly with multiple sequences that exceed the sliding + window's capacity. + """ + block_size = 1 + num_cpu_blocks = 8 + num_gpu_blocks = 8 + sliding_window = 2 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + sliding_window=sliding_window, + watermark=0) + + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks + + parent = Sequence(1, "one two three", [0, 1, 2], block_size) + seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), + None) + block_manager.allocate(seq_group) + + # assert the number of blocks allocated is correct + # the parent seq has len 3, but since sliding_window is 2, + # we will use at most 2 blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # Fork prompt and copy block tables. + child = parent.fork(2) + block_manager.fork(parent, child) + + # assert the number of blocks allocated is correct + # forking does not increase memory consumption + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # assert both parent and child share all blocks + assert block_manager.get_block_table( + parent) == block_manager.get_block_table(child) + + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. 
+ child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + + # assert the number of blocks allocated is correct + # we will use now one block more. Each seq will use 2 blocks, + # but only one can be shared + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + token_id = 5 + parent.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(parent) + + # assert the number of blocks allocated is correct + # no change, because both sequences are still just sharing one block + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + block_table_parent = block_manager.get_block_table(parent) + block_table_child = block_manager.get_block_table(child) + + assert block_table_parent != block_table_child + + # assert both blocks are sharing the second-last block + assert block_table_parent[-2] == block_table_child[-2] + + # now let's clean up... + block_manager.free(parent) + + # assert the number of blocks allocated is correct + # We have freed one seq, reducing the ref count of two blocks by one. + # One of the two was only used by the parent seq, so this is now free. + # The child seq still consumes sliding_window blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # free all blocks + block_manager.free(child) + + # assert all blocks are free now + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 8bfc14999f0a7..8b089a5650f48 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -312,7 +312,12 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: # Thus, it is always safe from OOM. src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.copy() - for block in src_block_table: + # When using a sliding window, blocks will be eventually reused. + # In this case the block tables will contain repeated blocks. + # When forking, we must make sure that each block's `ref_count` + # is only incremented by one, so we deduplicate them by wrapping + # them in a set. + for block in set(src_block_table): block.ref_count += 1 def _get_physical_blocks( @@ -393,7 +398,15 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: return block_number_mapping def _free_block_table(self, block_table: BlockTable) -> None: - for block in set(block_table): + # when using a sliding window, each seq will only use up + # to `self.block_sliding_window` blocks. When freeing + # the block table, we must make sure to not free blocks more + # than once. If no sliding window is used, there is no block + # reuse in the block table, so we must free all blocks. 
+ blocks_to_free = (block_table[-self.block_sliding_window:] + if self.block_sliding_window is not None else + block_table) + for block in set(blocks_to_free): if block.device == Device.GPU: self.gpu_allocator.free(block) else: From 602358f8a86ef9fc0ba882e083e19b44e00b9302 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Mar 2024 22:06:17 -0700 Subject: [PATCH 087/113] Add kernel for GeGLU with approximate GELU (#3337) --- csrc/activation_kernels.cu | 22 +++++++++++++++++++++- csrc/ops.h | 4 ++++ csrc/pybind.cpp | 6 +++++- tests/kernels/test_activation.py | 11 ++++++++--- vllm/model_executor/layers/activation.py | 13 +++++++++++-- 5 files changed, 49 insertions(+), 7 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 22b10f0571d1c..24d972702c858 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -33,12 +33,25 @@ template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. // Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 const float f = (float) x; constexpr float ALPHA = M_SQRT1_2; return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + const float f = (float) x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); + return (T) (0.5f * f * (1.0f + ::tanhf(inner))); +} + } // namespace vllm // Launch activation and gating kernel. @@ -73,6 +86,13 @@ void gelu_and_mul( LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } +void gelu_tanh_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); +} + namespace vllm { // Element-wise activation kernel template. 
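For reference, the gelu_tanh_kernel added above implements the tanh
approximation of GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
where BETA = M_SQRT2 * M_2_SQRTPI * 0.5 equals sqrt(2/pi). Below is a minimal
reference sketch, not part of the patch, that checks this formula against
PyTorch's F.gelu(..., approximate="tanh") on the CPU; the helper name
gelu_tanh_ref and the tensor shapes are illustrative only. The CUDA path itself
is exercised by tests/kernels/test_activation.py shown earlier in this patch.

    import math

    import torch
    import torch.nn.functional as F

    def gelu_tanh_ref(x: torch.Tensor) -> torch.Tensor:
        # Same constants as the CUDA kernel: BETA = sqrt(2/pi), KAPPA = 0.044715.
        beta = math.sqrt(2.0 / math.pi)
        kappa = 0.044715
        return 0.5 * x * (1.0 + torch.tanh(beta * (x + kappa * x.pow(3))))

    # GeGLU layout used by gelu_tanh_and_mul: act(x[..., :d]) * x[..., d:].
    x = torch.randn(128, 2 * 64)
    d = x.shape[-1] // 2
    out_ref = gelu_tanh_ref(x[..., :d]) * x[..., d:]
    out_torch = F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
    torch.testing.assert_close(out_ref, out_torch)
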
diff --git a/csrc/ops.h b/csrc/ops.h index 249c7451bf73c..53222972abb70 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -61,6 +61,10 @@ void gelu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_tanh_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 4b6ade7566398..39384f08d928c 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -25,7 +25,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def( "gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU."); + "Activation function used in GeGLU with `none` approximation."); + ops.def( + "gelu_tanh_and_mul", + &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); ops.def( "gelu_new", &gelu_new, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index e0dec144eba11..f78913f120aa4 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -16,7 +16,7 @@ ] -@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) +@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -24,7 +24,7 @@ @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( - activation: Type[torch.nn.Module], + activation: str, num_tokens: int, d: int, dtype: torch.dtype, @@ -36,7 +36,12 @@ def test_act_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = activation() + if activation == "silu": + layer = SiluAndMul() + elif activation == "gelu": + layer = GeluAndMul(approximate="none") + elif activation == "gelu_tanh": + layer = GeluAndMul(approximate="tanh") out = layer(x) ref_out = layer._forward(x) # The SiLU and GELU implementations are equivalent to the native PyTorch diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5a3a7b2dbaee7..3eb73ee109f50 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -47,16 +47,25 @@ class GeluAndMul(nn.Module): return: (batch_size, seq_len, d) or (num_tokens, d) """ + def __init__(self, approximate: str = "none"): + super().__init__() + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 - return F.gelu(x[..., :d]) * x[..., d:] + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) return out From b167109ba12f18d028d2be8a61d3dce950eb2724 Mon Sep 17 00:00:00 2001 From: Bo-Wen Wang <1849994161@qq.com> Date: Wed, 13 Mar 2024 13:51:42 +0800 Subject: [PATCH 088/113] [Fix] Fix quantization="gptq" when using Marlin (#3319) Co-authored-by: Woosuk Kwon --- vllm/config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 
d2b68b6fa1fe2..319c1569f5e98 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -168,13 +168,18 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: - hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. if (hf_quant_method == "gptq" and "is_marlin_format" in hf_quant_config and hf_quant_config["is_marlin_format"]): + logger.info("The model is serialized in Marlin format. " + "Using Marlin kernel.") hf_quant_method = "marlin" + if self.quantization == "gptq": + self.quantization = hf_quant_method + if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: From e221910e77087743a50560e4ae69c3c2a12beb53 Mon Sep 17 00:00:00 2001 From: Ronan McGovern <78278410+RonanKMcGovern@users.noreply.github.com> Date: Wed, 13 Mar 2024 06:33:43 +0000 Subject: [PATCH 089/113] add hf_transfer to requirements.txt (#3031) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index dd4867702d3de..18770f994ebd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -96,7 +96,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate + pip install accelerate hf_transfer COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm From ba8dc958a3d8533a6e5b7debda47e4d42a062b78 Mon Sep 17 00:00:00 2001 From: Hui Liu <96135754+hliuca@users.noreply.github.com> Date: Wed, 13 Mar 2024 09:16:55 -0700 Subject: [PATCH 090/113] [Minor] Fix bias in if to remove ambiguity (#3259) --- vllm/model_executor/layers/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 60f6fc83b200f..40e681df48f86 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -73,7 +73,7 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - if bias: + if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) return F.linear(x, weight, bias) From 739c350c1926682f435316294491aa54661849b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= Date: Thu, 14 Mar 2024 00:43:24 +0800 Subject: [PATCH 091/113] [Minor Fix] Use cupy-cuda11x in CUDA 11.8 build (#3256) --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index 023c3cde1910c..accf6bb400310 100644 --- a/setup.py +++ b/setup.py @@ -431,6 +431,12 @@ def get_requirements() -> List[str]: else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") + if nvcc_cuda_version <= Version("11.8"): + # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x + for i in range(len(requirements)): + if requirements[i].startswith("cupy-cuda12x"): + requirements[i] = "cupy-cuda11x" + break return requirements From ae0ccb40170d140ded8de99fc905fd8cb0bd409c Mon Sep 17 00:00:00 2001 From: Or Sharir Date: Wed, 13 Mar 2024 21:18:25 +0200 Subject: [PATCH 092/113] Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism) when using Multi-LoRA. 
(#3350) --- csrc/punica/bgmv/bgmv_config.h | 1 + tests/lora/test_punica.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 4dc90de1ab42a..a7415dfc91369 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -43,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index cbe0f6fa2e851..fd707766c6a30 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -45,7 +45,7 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, - 24576, 32000, 32256, 32512, 32768, 33024 + 22016, 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] From 7e9bd08f60a4b18e3646ff986caeacde9ffffa53 Mon Sep 17 00:00:00 2001 From: Terry <149540247+tterrysun@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:45:26 -0700 Subject: [PATCH 093/113] Add batched RoPE kernel (#3095) --- benchmarks/kernels/benchmark_rope.py | 120 ++++++++++++++++ csrc/ops.h | 10 ++ csrc/pos_encoding_kernels.cu | 126 ++++++++++++++-- csrc/pybind.cpp | 5 + tests/kernels/test_pos_encoding.py | 135 +++++++++++++++++- .../model_executor/layers/rotary_embedding.py | 58 +++++--- 6 files changed, 417 insertions(+), 37 deletions(-) create mode 100644 benchmarks/kernels/benchmark_rope.py diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 0000000000000..f9564dd9588f0 --- /dev/null +++ b/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,120 @@ +from typing import Optional + +import argparse +import torch +import nvtx +from itertools import accumulate +from vllm.model_executor.layers.rotary_embedding import get_rope + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope(head_size, rotary_dim, max_position, base, + is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope(head_size, rotary_dim, max_position, base, is_neox_style, + { + "type": "linear", + "factor": (scaling_factor, ) + })) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv cache of its type 
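The offset arithmetic described in the comment above can be shown on its own: each scaling factor owns one contiguous slice of the concatenated cache, and a request's offset is the start of its slice. The sizes below are made up for illustration and the snippet is independent of the benchmark script.

# Standalone sketch of per-factor cache offsets (toy sizes, not the benchmark).
from itertools import accumulate

max_position = 8
scaling_factors = [1, 2, 4, 8]

# Slice i holds the rows belonging to scaling_factors[i]; its length here is
# simply max_position * factor (the benchmark sizes its slices differently).
slice_lengths = [max_position * s for s in scaling_factors]
offsets = list(accumulate([0] + slice_lengths[:-1]))
assert offsets == [0, 8, 24, 56]

# A request of type t at position pos reads row offsets[t] + pos of the cache.
for t, pos in [(0, 3), (2, 5), (3, 7)]:
    row = offsets[t] + pos
    assert offsets[t] <= row < offsets[t] + slice_lengths[t]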
+ offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Benchmark the rotary embedding kernels.") + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, + choices=[64, 80, 96, 112, 128, 256], + default=128) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument("--dtype", + type=str, + choices=["bfloat16", "float"], + default="float") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--device", + type=str, + choices=["cuda:0", "cuda:1"], + default="cuda:0") + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/csrc/ops.h b/csrc/ops.h index 53222972abb70..d5d6e240da7c4 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -53,6 +53,16 @@ void rotary_embedding( torch::Tensor& cos_sin_cache, bool is_neox); +void batched_rotary_embedding( + torch::Tensor& positions, + torch::Tensor& query, + torch::Tensor& key, + int head_size, + torch::Tensor& cos_sin_cache, + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets); + void silu_and_mul( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 5f522795619e1..d80cb6973fad6 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -8,7 +8,7 @@ namespace vllm { template -inline __device__ void apply_rotary_embedding( +inline __device__ void apply_token_rotary_embedding( scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, const scalar_t* __restrict__ sin_ptr, @@ -38,22 +38,18 @@ inline __device__ void apply_rotary_embedding( } template -__global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +inline __device__ void apply_rotary_embedding( scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const 
scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, + const scalar_t* cache_ptr, + const int head_size, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - + const int rot_dim, + const int token_idx, + const int64_t query_stride, + const int64_t key_stride) +{ const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -63,7 +59,7 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * query_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(query + token_head, cos_ptr, + apply_token_rotary_embedding(query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } @@ -72,11 +68,53 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * key_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(key + token_head, cos_ptr, + apply_token_rotary_embedding(key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } } +template +__global__ void rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + +template +__global__ void batched_rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. 
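For readers who want the math behind these kernels, a NeoX-style rotation of a single token can be written in a few lines of plain Python. This is an illustrative reference under the usual RoPE pairing convention (element i paired with i + rot_dim/2), not the CUDA implementation above.

# Pure-Python reference for one token's NeoX-style rotary embedding.
import math

def rotate_token_neox(vec, pos, rot_dim, base=10000.0):
    """Return a copy of one head vector with its first rot_dim entries rotated."""
    embed_dim = rot_dim // 2
    out = list(vec)
    for i in range(embed_dim):
        inv_freq = base ** (-2.0 * i / rot_dim)
        cos, sin = math.cos(pos * inv_freq), math.sin(pos * inv_freq)
        x, y = vec[i], vec[i + embed_dim]        # NeoX pairing: (i, i + rot_dim/2)
        out[i] = x * cos - y * sin
        out[i + embed_dim] = y * cos + x * sin
    return out

head = [0.1 * k for k in range(8)]               # head_size = rot_dim = 8
assert rotate_token_neox(head, pos=3, rot_dim=8) != head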
+ const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + } // namespace vllm void rotary_embedding( @@ -128,3 +166,61 @@ void rotary_embedding( } }); } + +/* +Batched version of rotary embedding, pack multiple LoRAs together +and process in batched manner. +*/ +void batched_rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] +) { + int64_t num_tokens = cos_sin_cache_offsets.size(0); + int num_heads = query.size(-1) / head_size; + int num_kv_heads = key.size(-1) / head_size; + int64_t query_stride = query.stride(-2); + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), + "rotary_embedding", + [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } else { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } + }); +} diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 39384f08d928c..a5c6439fd6909 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -56,6 +56,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def( + "batched_rotary_embedding", + &batched_rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)"); + // Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0d27bbaff9fc5..ffdcc1e8c80fd 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import List, Optional import pytest import torch from allclose_default import get_default_atol, get_default_rtol +from itertools import accumulate from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] @@ -72,3 +73,135 @@ def test_rotary_embedding( ref_key, atol=get_default_atol(out_key), rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) 
+@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": (1, ) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key) + out_query, out_key = rope.forward(positions, + query, + key, + offsets=torch.zeros(batch_size * seq_len, + dtype=int, + device=device)) + # Compare the results. + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + scaling_factors: List[int] = [1, 2, 4] + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + query_offsets = offset_map[query_types] + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key, query_offsets) + out_query, out_key = rope.forward(positions, query, key, + query_offsets.flatten()) + # Compare the results. 
+ assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 13749570f28a2..db5c7080b50b0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -22,7 +22,7 @@ # limitations under the License. """Rotary Positional Embeddings.""" import math -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -96,6 +96,7 @@ def _forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query = query.view(*query.shape[:-1], -1, self.head_size) @@ -107,7 +108,9 @@ def _forward( query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - cos_sin = self.cos_sin_cache[positions] + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) if self.is_neox_style: # NOTE(woosuk): Here we assume that the positions tensor has the @@ -137,11 +140,19 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - # ops.rotary_embedding() is an in-place operation that - # updates the query and key tensors. - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + # ops.rotary_embedding()/batched_rotary_embedding() are in-place operations that + # update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key @@ -158,27 +169,32 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factor: float, + scaling_factors: Union[List[float], float], ) -> None: - self.scaling_factor = scaling_factor + if isinstance(scaling_factors, float): + scaling_factors = [scaling_factors] + self.scaling_factors = scaling_factors super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style) def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. 
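To make the per-factor rebuild of the cache in this hunk concrete, the toy sketch below builds the same kind of concatenated cache with tiny sizes: every factor contributes max_position * factor rows, and each position is divided by its factor before the frequencies are applied, so every slice stays within the originally trained range. Names and sizes are illustrative only, not the vLLM implementation.

# Toy illustration of linear RoPE scaling with multiple factors.
import math

max_position, rot_dim, base = 4, 4, 10000.0
inv_freq = [base ** (-2.0 * i / rot_dim) for i in range(rot_dim // 2)]

def cos_sin_rows(scaling_factor):
    rows = []
    for t in range(max_position * scaling_factor):
        t_scaled = t / scaling_factor            # the key step of linear scaling
        freqs = [t_scaled * f for f in inv_freq]
        rows.append([math.cos(v) for v in freqs] + [math.sin(v) for v in freqs])
    return rows

cache = []
for factor in (1, 2, 4):                         # one slice per scaling factor
    cache.extend(cos_sin_rows(factor))

# Slice sizes add up, which is what the flat per-request offsets rely on.
assert len(cache) == max_position * (1 + 2 + 4)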
- max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache + cache_list = [] + for scaling_factor in self.scaling_factors: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. + max_len = self.max_position_embeddings * scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + cache_list.append(cache) + return torch.cat(cache_list, dim=0) class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): From c33afd89f56ba5c260275fdd6723c59642f82f22 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 13 Mar 2024 13:56:49 -0700 Subject: [PATCH 094/113] Fix lint (#3388) --- vllm/model_executor/layers/rotary_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index db5c7080b50b0..71af9b26e2e93 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -143,8 +143,8 @@ def forward( offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) - # ops.rotary_embedding()/batched_rotary_embedding() are in-place operations that - # update the query and key tensors. + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. if offsets is not None: ops.batched_rotary_embedding(positions, query, key, self.head_size, self.cos_sin_cache, From eeab52a4ff02e15f970880a689df2861ad173770 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 13 Mar 2024 14:18:40 -0700 Subject: [PATCH 095/113] [FIX] Simpler fix for async engine running on ray (#3371) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 261fcfb7dad9b..82a2b456895e8 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -430,8 +430,7 @@ async def execute_model_async( "blocks_to_swap_in": blocks_to_swap_in, "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + }) # Only the driver worker returns the sampling results. 
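A minimal sketch of the convention relied on just below: every worker executes the step, but only the first (driver) entry of the gathered outputs carries sampler results, so the caller keeps element 0. The names here are hypothetical stand-ins, not the vLLM executor API.

# Hypothetical illustration of "only the driver worker returns results".
def run_workers(n_workers):
    outputs = []
    for rank in range(n_workers):
        outputs.append({"sampled": [1, 2, 3]} if rank == 0 else None)
    return outputs

all_outputs = run_workers(4)
output = all_outputs[0]                  # driver result; the rest are placeholders
assert output is not None and all(o is None for o in all_outputs[1:])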
output = all_outputs[0] From 81653d968842d2ec51b2642b6b5d83786271f9af Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 13 Mar 2024 17:02:21 -0700 Subject: [PATCH 096/113] [Hotfix] [Debug] test_openai_server.py::test_guided_regex_completion (#3383) --- .buildkite/test-pipeline.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 42a1eacb6de57..6a130f6fadcc3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,7 +13,7 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness - + - label: Core Test command: pytest -v -s core diff --git a/requirements.txt b/requirements.txt index 05ec2e804e13b..d6c33ad85da58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 -outlines >= 0.0.27 +outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. From a37415c31b3b5c7ab40d2d897192025f0ca7be08 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Thu, 14 Mar 2024 14:35:13 +0800 Subject: [PATCH 097/113] allow user to chose which vllm's merics to display in grafana (#3393) --- examples/production_monitoring/grafana.json | 184 ++++++++++---------- 1 file changed, 88 insertions(+), 96 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index f48b6314eb055..071f134c6e5e0 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,35 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.2.3" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -42,6 +11,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -50,14 +25,14 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 29, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "End to end request latency measured in seconds.", "fieldConfig": { @@ -66,7 +41,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -80,7 +54,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -138,11 +111,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, 
"includeNullMetadata": false, "instant": false, @@ -154,11 +127,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -171,11 +144,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -188,11 +161,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -205,10 +178,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -222,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of tokens processed per second", "fieldConfig": { @@ -231,7 +204,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -245,7 +217,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -302,11 +273,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -318,11 +289,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -339,7 +310,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Inter token latency in seconds.", "fieldConfig": { @@ -348,7 +319,6 @@ "mode": 
"palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -362,7 +332,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -420,11 +389,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -436,11 +405,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -453,11 +422,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -470,11 +439,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -487,10 +456,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Mean", @@ -504,7 +473,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { @@ -513,7 +482,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -527,7 +495,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -585,11 +552,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_running", + "expr": 
"vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -601,11 +568,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -618,11 +585,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_waiting", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -639,7 +606,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { @@ -648,7 +615,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -662,7 +628,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -720,11 +685,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -737,11 +702,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -753,11 +718,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -770,11 +735,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -787,10 +752,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": 
"rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -804,7 +769,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { @@ -813,7 +778,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -827,7 +791,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -885,10 +848,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, "legendFormat": "GPU Cache Usage", "range": true, @@ -897,10 +860,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, "instant": false, "legendFormat": "CPU Cache Usage", @@ -913,10 +876,39 @@ } ], "refresh": "", - "schemaVersion": 39, + "schemaVersion": 37, + "style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "current": { + "selected": false, + "text": "vllm", + "value": "vllm" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(model_name)", + "hide": 0, + "includeAll": false, + "label": "model_name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { "from": "now-5m", From 8fe838659164b415d7f3044ec6b7e5bc52c6b6a5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 14 Mar 2024 01:11:48 -0700 Subject: [PATCH 098/113] [Kernel] change benchmark script so that result can be directly used; tune moe kernel in A100/H100 with tp=2,4,8 (#3389) --- benchmarks/kernels/benchmark_mixtral_moe.py | 30 ++-- .../layers/fused_moe/__init__.py | 6 +- ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 146 +++++++++++++++ ...792,device_name=NVIDIA_H100_80GB_HBM3.json | 146 +++++++++++++++ ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 162 +++++++++++++++-- ...584,device_name=NVIDIA_H100_80GB_HBM3.json | 146 +++++++++++++++ ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 146 +++++++++++++++ ...168,device_name=NVIDIA_H100_80GB_HBM3.json | 166 +++++++++++++++--- .../layers/fused_moe/fused_moe.py | 10 +- 9 files changed, 903 insertions(+), 55 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git 
a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 9e08df76947f8..964eca5aaf72b 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -2,13 +2,13 @@ import os import sys -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name import torch import torch.nn.functional as F import triton +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + def main(): method = fused_moe @@ -64,7 +64,7 @@ def run_grid(bs, method): print(f'{tp_size=} {bs=}') print(f'{config}') # warmup - print(f'warming up') + print('warming up') try: for _ in range(num_warmup_trials): run_timing( @@ -82,7 +82,7 @@ def run_grid(bs, method): continue # trial - print(f'benchmarking') + print('benchmarking') for _ in range(num_trials): kernel_dur_ms = run_timing( num_calls=num_calls, @@ -103,17 +103,25 @@ def run_grid(bs, method): best_config = config best_time_us = kernel_dur_us - print( - f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' - ) + print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}' + f' {bs=} {tp_size=} {top_k=} {num_total_experts=} ' + f'{d_model=} {model_intermediate_size=} {num_layers=}') print("best_time_us", best_time_us) print("best_config", best_config) - filename = "/tmp/config.jsonl" + # holds Dict[str, Dict[str, int]] + filename = get_config_file_name(num_total_experts, + model_intermediate_size // tp_size) print(f"writing config to file {filename}") - with open(filename, "a") as f: - f.write(json.dumps({str(bs): best_config}) + "\n") + existing_content = {} + if os.path.exists(filename): + with open(filename, "r") as f: + existing_content = json.load(f) + existing_content[str(bs)] = best_config + with open(filename, "w") as f: + json.dump(existing_content, f, indent=4) + f.write("\n") def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1391d43c8abeb..299ab44f8f3d5 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,9 @@ -from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_moe, + get_config_file_name, +) __all__ = [ "fused_moe", + "get_config_file_name", ] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..5c8185cfdeec1 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..97c9f4445b166 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json index 1fefb5ff7e42d..edf2a38d12ad3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -1,20 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..b2100cebb7f58 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..f578c8d0160ac --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json index 64d49ca66c1c8..e341a67917d51 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -1,24 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + 
"num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e6dd0dfe2eb3..1ec09f0cd4c28 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -245,6 +245,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +def get_config_file_name(E: int, N: int) -> str: + device_name = torch.cuda.get_device_name().replace(" ", "_") + return f"E={E},N={N},device_name={device_name}.json" + + @functools.lru_cache def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ @@ -258,11 +263,10 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # First look up if an optimized configuration is available in the configs # directory - device_name = torch.cuda.get_device_name().replace(" ", "_") + json_file_name = get_config_file_name(E, N) config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "configs", - f"E={E},N={N},device_name={device_name}.json") + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: logger.info( From 06ec486794f42db656c3cc16c8c5ed56ce4f696b Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 14 Mar 2024 18:55:54 +0100 Subject: [PATCH 099/113] Install `flash_attn` in Docker image (#3396) --- Dockerfile | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Dockerfile b/Dockerfile index 18770f994ebd2..8be03b3567f0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### +#################### FLASH_ATTENTION Build IMAGE #################### +FROM dev as flash-attn-builder +# max jobs used for build +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} +# flash attention version +ARG flash_attn_version=v2.5.6 +ENV FLASH_ATTN_VERSION=${flash_attn_version} + +WORKDIR /usr/src/flash-attention-v2 + +# Download the wheel or build it if a pre-compiled release doesn't exist +RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ + --no-build-isolation --no-deps --no-cache-dir + +#################### FLASH_ATTENTION Build IMAGE #################### #################### TEST IMAGE #################### # image to run unit testing suite @@ -68,6 +84,9 @@ WORKDIR /vllm-workspace # ADD is used to preserve directory structure ADD . /vllm-workspace/ COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose @@ -88,6 +107,11 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt + +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir + #################### RUNTIME BASE IMAGE #################### From c17ca8ef186b5e90a500d3e37724b220944450f7 Mon Sep 17 00:00:00 2001 From: Dan Clark <44146800+declark1@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:11:45 -0700 Subject: [PATCH 100/113] Add args for mTLS support (#3410) Co-authored-by: Daniel Clark --- vllm/entrypoints/api_server.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 86b6c4c67cfa4..5130586e036b2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -82,6 +82,14 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument("--ssl-cert-reqs", + type=int, + default=0, + help="Whether client certificate is required") parser.add_argument( "--root-path", type=str, @@ -100,4 +108,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: log_level="debug", timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs) From dfc77408bdca19308cbb28a54dfe697442fbf335 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 14 Mar 2024 13:16:00 -0700 Subject: [PATCH 101/113] [issue templates] add some issue templates (#3412) --- .github/ISSUE_TEMPLATE/100-documentation.yml | 22 + .github/ISSUE_TEMPLATE/200-installation.yml | 39 + .github/ISSUE_TEMPLATE/300-usage.yml | 37 + .github/ISSUE_TEMPLATE/400-bug report.yml | 81 +++ .../ISSUE_TEMPLATE/500-feature request.yml | 31 + .github/ISSUE_TEMPLATE/600-new model.yml | 33 + .../700-performance discussion.yml | 51 ++ .../ISSUE_TEMPLATE/800-misc discussion.yml | 21 + .github/ISSUE_TEMPLATE/config.yml | 1 + .yapfignore | 1 + collect_env.py | 688 ++++++++++++++++++ 11 files changed, 1005 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/100-documentation.yml create mode 100644 .github/ISSUE_TEMPLATE/200-installation.yml create mode 100644 .github/ISSUE_TEMPLATE/300-usage.yml create mode 100644 .github/ISSUE_TEMPLATE/400-bug report.yml create mode 100644 .github/ISSUE_TEMPLATE/500-feature request.yml create mode 100644 .github/ISSUE_TEMPLATE/600-new model.yml create mode 100644 .github/ISSUE_TEMPLATE/700-performance discussion.yml create mode 100644 .github/ISSUE_TEMPLATE/800-misc discussion.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .yapfignore create mode 100644 collect_env.py diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml new file mode 100644 index 0000000000000..7ef052a525963 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -0,0 +1,22 @@ +name: 📚 Documentation +description: Report an issue related to https://docs.vllm.ai/ +title: "[Doc]: " +labels: 
["doc"] + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://docs.vllm.ai/ is an issue. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml new file mode 100644 index 0000000000000..4c6c96187cc6c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -0,0 +1,39 @@ +name: 🛠️ Installation +description: Report an issue here when you hit errors during installation. +title: "[Installation]: " +labels: ["installation"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How you are installing vllm + description: | + Paste the full command you are trying to execute. + value: | + ```sh + pip install -vvv vllm + ``` +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml new file mode 100644 index 0000000000000..88227b4b2e7b9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -0,0 +1,37 @@ +name: 💻 Usage +description: Raise an issue here if you don't know how to use vllm. +title: "[Usage]: " +labels: ["usage"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How would you like to use vllm + description: | + A detailed description of how you want to use vllm. + value: | + I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml new file mode 100644 index 0000000000000..f1124dfa78bbc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -0,0 +1,81 @@ +name: 🐛 Bug report +description: Raise an issue here if you find a bug. 
+title: "[Bug]: " +labels: ["bug"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: 🐛 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="facebook/opt-125m") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: markdown + attributes: + value: > + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + + - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). + + - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. + + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature request.yml new file mode 100644 index 0000000000000..0dd5a3e5d14de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/500-feature request.yml @@ -0,0 +1,31 @@ +name: 🚀 Feature request +description: Submit a proposal/request for a new vllm feature +title: "[Feature]: " +labels: ["feature"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new model.yml new file mode 100644 index 0000000000000..bbddbfd67138a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/600-new model.yml @@ -0,0 +1,33 @@ +name: 🤗 Support request for a new model from huggingface +description: Submit a proposal/request for a new model from huggingface +title: "[New Model]: " +labels: ["new model"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). + + #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. +- type: textarea + attributes: + label: The model to consider. + description: > + A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . + validations: + required: true +- type: textarea + attributes: + label: The closest model vllm already supports. + description: > + Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? +- type: textarea + attributes: + label: What's your difficulty of supporting the model you want? + description: > + For example, any new operators or new architecture? +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml new file mode 100644 index 0000000000000..9e8e7b4aa3530 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml @@ -0,0 +1,51 @@ +name: ⚡ Discussion on the performance of vllm +description: Submit a proposal/discussion about the performance of vllm +title: "[Performance]: " +labels: ["performance"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Proposal to improve performance + description: > + How do you plan to improve vllm's performance? + validations: + required: false +- type: textarea + attributes: + label: Report of performance regression + description: > + Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . + validations: + required: false +- type: textarea + attributes: + label: Misc discussion on performance + description: > + Anything about the performance. + validations: + required: false +- type: textarea + attributes: + label: Your current environment (if you think it is necessary) + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc discussion.yml new file mode 100644 index 0000000000000..ddb10f72db293 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/800-misc discussion.yml @@ -0,0 +1,21 @@ +name: 🎲 Misc/random discussions that do not fit into the above categories. +description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. +title: "[Misc]: " +labels: ["misc"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Anything you want to discuss about vllm. + description: > + Anything you want to discuss about vllm. + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000..3ba13e0cec6cb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.yapfignore b/.yapfignore new file mode 100644 index 0000000000000..2d6dcf8380cac --- /dev/null +++ b/.yapfignore @@ -0,0 +1 @@ +collect_env.py diff --git a/collect_env.py b/collect_env.py new file mode 100644 index 0000000000000..a886db693e2f1 --- /dev/null +++ b/collect_env.py @@ -0,0 +1,688 @@ +# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +import datetime +import locale +import re +import subprocess +import sys +import os +from collections import namedtuple + + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple('SystemEnv', [ + 'torch_version', + 'is_debug_build', + 'cuda_compiled_version', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'is_cuda_available', + 'cuda_runtime_version', + 'cuda_module_loading', + 'nvidia_driver_version', + 'nvidia_gpu_models', + 'cudnn_version', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'hip_compiled_version', + 'hip_runtime_version', + 'miopen_runtime_version', + 'caching_allocator_config', + 'is_xnnpack_available', + 'cpu_info', + 'rocm_version', # vllm specific field + 'neuron_sdk_version', # vllm specific field + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field +]) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = 
run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") + and any(name in line for name in patterns) + ) + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', r'cmake (.*)') + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == 'darwin': + cmd = 'kextstat | grep -i cuda' + return run_and_parse_first_match(run_lambda, cmd, + r'com[.]nvidia[.]CUDA [(](.*?)[)]') + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r'Driver Version: (.*?) ') + + +def get_gpu_info(run_lambda): + if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(torch.version, 'hip') and torch.version.hip is not None): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r' \(UUID: .+?\)') + rc, out, _ = run_lambda(smi + ' -L') + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, '', out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'nvcc --version', r'release .+ V(.*)') + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") + where_cmd = os.path.join(system_root, 'System32', 'where') + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == 'darwin': + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. 
+ cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get('CUDNN_LIBRARY') + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split('\n'): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = '\n'.join(files) + return 'Probably one of the following:\n{}'.format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = 'nvidia-smi' + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, 'hipcc --version', r'HIP version: (\S+)') + + +def get_neuron_sdk_version(run_lambda): + # Adapted from your install script + try: + result = run_lambda(["neuron-ls"]) + return result if result[0] == 0 else 'N/A' + except Exception: + return 'N/A' + + +def get_vllm_version(): + try: + import vllm + return vllm.__version__ + except ImportError: + return 'N/A' + + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. 
+ return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), + 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', + 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', + ) + + +def get_gpu_topo(run_lambda): + if get_platform() == 'linux': + return run_and_read_all(run_lambda, 'nvidia-smi topo -m') + return None + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda('wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE') + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + 
else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all(run_lambda, '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', r'Description:\t(.*)') + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join( + line + for line in out.splitlines() + if any(name in line for name in patterns) + ) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '') + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get('CUDA_MODULE_LOADING', '') + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, 'hip') or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + else: # HIP version + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else 'N/A' + + cfg = torch._C._show_config().split('\n') + hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime') + miopen_runtime_version = get_version_or_na(cfg, 'MIOpen') + cuda_version_str = 'N/A' + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A' + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + neuron_sdk_version = get_neuron_sdk_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format(sys_version, sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + 
cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + neuron_sdk_version=neuron_sdk_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + ) + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +env_info_fmt += """ +ROCM Version: {rocm_version} +Neuron SDK Version: {neuron_sdk_version} +vLLM Version: {vllm_version} +vLLM Build Flags: +{vllm_build_flags} +GPU Topology: +{gpu_topo} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement='Could not collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. 
+ if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict['nvidia_gpu_models'] = \ + maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + 'cuda_runtime_version', + 'nvidia_gpu_models', + 'nvidia_driver_version', + ] + all_cuda_fields = dynamic_cuda_fields + ['cudnn_version'] + all_dynamic_cuda_fields_missing = all( + mutable_dict[field] is None for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = 'No CUDA' + if envinfo.cuda_compiled_version is None: + mutable_dict['cuda_compiled_version'] = 'None' + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty(mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty(mutable_dict['conda_packages']) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend(mutable_dict['pip_packages'], + '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend(mutable_dict['conda_packages'], + '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include it when you file a report ***" + print(msg, file=sys.stderr) + + + +if __name__ == '__main__': + main() From 54be8a0be2819340ce7c2d7993382559597f5665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= Date: Fri, 15 Mar 2024 04:56:57 +0800 Subject: [PATCH 102/113] Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel --- tests/test_config.py | 43 +++++++++++++++++++++++++++++++++++++++++++ vllm/config.py | 14 ++++++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/test_config.py diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000000000..13a9f76212679 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,43 @@ +from vllm.config import ModelConfig + + +def test_get_sliding_window(): + TEST_SLIDING_WINDOW = 4096 + # Test that the sliding window is correctly computed. 
+ # For Qwen1.5/Qwen2, get_sliding_window() should be None + # when use_sliding_window is False. + qwen2_model_config = ModelConfig( + "Qwen/Qwen1.5-7B", + "Qwen/Qwen1.5-7B", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + + qwen2_model_config.hf_config.use_sliding_window = False + qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert qwen2_model_config.get_sliding_window() is None + + qwen2_model_config.hf_config.use_sliding_window = True + assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW + + mistral_model_config = ModelConfig( + "mistralai/Mistral-7B-v0.1", + "mistralai/Mistral-7B-v0.1", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + mistral_model_config.hf_config.sliding_window = None + assert mistral_model_config.get_sliding_window() is None + + mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index 319c1569f5e98..de687395a0001 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -103,6 +103,7 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -139,7 +140,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -232,6 +233,15 @@ def verify_with_parallel_config( f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size, or None if disabled. + """ + + # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in + # addition to sliding window size. We check if that field is present + # and if it's False, return None. + if (hasattr(self.hf_config, "use_sliding_window") + and not self.hf_config.use_sliding_window): + return None return getattr(self.hf_config, "sliding_window", None) def get_vocab_size(self) -> int: @@ -624,7 +634,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. From 4518f5a981aba715d7f0e8b2aab4cbef18196aa6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:18:25 +0000 Subject: [PATCH 103/113] format --- vllm/config.py | 3 ++- vllm/model_executor/layers/linear.py | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c1fe1397ca7e9..56fe6b522e7ee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -172,7 +172,8 @@ def _verify_sparsity(self) -> None: raise ValueError("Both sparsity and quantization detected. 
Only " "one or the other is supported at a time.") - if self.sparsity is not None and self.sparsity not in supported_sparsity: + if (self.sparsity is not None + and self.sparsity not in supported_sparsity): raise ValueError(f"Unknown sparse method: {self.sparsity}. Must " f"be one of {supported_sparsity}.") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dbddea7a3b5f3..131f1ea2208b2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -338,9 +338,10 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # This is super hacky for now but we basically want to only compress once all - # of the shards are loaded, right now we just check if the number of shards - # loaded matches the number of outputs expected, assuming one shard per output + # This is super hacky for now but we basically want to only compress + # once all of the shards are loaded, right now we just check if the + # number of shards loaded matches the number of outputs expected, + # assuming one shard per output all_shards_loaded = (len(self.loaded_shards) == len(self.output_sizes)) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() @@ -489,9 +490,9 @@ def weight_loader(self, self.loaded_shards.add(loaded_shard_id) - # This is super hacky for now but we basically want to only compress once - # all of the shards are loaded, for the QKV matrix this means - # loading shards "q", "k" and "v" + # This is super hacky for now but we basically want to only + # compress once all of the shards are loaded, for the QKV matrix + # this means loading shards "q", "k" and "v" all_shards_loaded = (self.loaded_shards == set(["q", "k", "v"])) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() From 5bc7a73116a67525b3692a2ecd9a51a5838b4e29 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:24:49 +0000 Subject: [PATCH 104/113] formating --- .../layers/parameters/__init__.py | 4 +++- .../layers/parameters/lazy_compressed.py | 21 +++++++++++-------- .../layers/sparsity/__init__.py | 8 ++++--- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index c05cdf56e27a4..d05d73a79c13e 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,4 +1,6 @@ -from vllm.model_executor.layers.parameters.lazy_compressed import LazyCompressedParameter +from vllm.model_executor.layers.parameters.lazy_compressed import ( + LazyCompressedParameter +) __all__ = [ "LazyCompressedParameter", diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 37128a6ed54b7..65d44167c004a 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -66,7 +66,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs): def unwrap(e): nonlocal ret_storage_format_cls if isinstance(e, LazyCompressedParameter): - assert ret_storage_format_cls is None or ret_storage_format_cls == e.storage_format_cls + assert (ret_storage_format_cls is None or + ret_storage_format_cls == e.storage_format_cls) ret_storage_format_cls = e.storage_format_cls if e.is_empty: @@ -86,7 +87,8 @@ def wrap(e): torch.Tensor) and ret_storage_format_cls is not None: 
return LazyCompressedParameter( e, - # Here, "e" is the output of "func" so it is real data and we store it + # Here, "e" is the output of "func" so it is real + # data and we store it is_empty=False, storage_format_cls=ret_storage_format_cls) return e @@ -98,9 +100,10 @@ def compress(self) -> None: from magic_wand import SparseSemiStructuredStorageFormat if self.storage_format_cls == SparseSemiStructuredStorageFormat: - # Semi-structured sparsity assumes a 2:4 pattern, where each 4 elements - # have at minimum 2 zeros. We need to validate this pattern exists, so - # we check the whole tensor before committing to compression. + # Semi-structured sparsity assumes a 2:4 pattern, where + # each 4 elements have at minimum 2 zeros. We need to validate + # this pattern exists, so we check the whole tensor + # before committing to compression. # Count zeros in each group of 4 reshaped_tensor = self.uncompressed_data.view(-1, 4) @@ -112,8 +115,8 @@ def compress(self) -> None: if not has_semi_structured_sparsity: logger.warning( - f"Called compress() on tensor of shape {self.shape} but does not " - "have 2:4 sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape} but " + "does not have 2:4 sparsity, skipping compression") return else: @@ -123,8 +126,8 @@ def compress(self) -> None: # Only compress if we have sufficient sparsity (>=40%) if sparsity < 0.4: logger.warning( - f"Called compress() on tensor of shape {self.shape} but only has " - f"{sparsity:.2}% sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape}, but " + f"only has {sparsity:.2}% sparsity, skipping compression") return if self.uncompressed_data is None: diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 874819f343373..204281924e9ad 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -7,9 +7,11 @@ "magic_wand is not available and required for sparsity " "support. Please install it with `pip install nm-magic-wand`") -from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 -from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( + SemiStructuredSparseW16A16Config # noqa: E402 +) _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, From 6f60731d98dcbd763c86e643194e207be0b2f65f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:28:34 +0000 Subject: [PATCH 105/113] ruff --- tests/models/compare_utils.py | 16 ++++++++++------ tests/models/test_compressed.py | 6 +++--- tests/models/test_compressed_memory.py | 12 ++++++++---- tests/models/test_marlin.py | 4 +++- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index aefaf881048b8..235ccad0549a7 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -1,5 +1,5 @@ -"""Compare the logprobs of two sequences generated by different models, which should -be similar but not necessarily equal. 
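To make the two compression gates above concrete, here is a small self-contained sketch; it needs only torch, and the names are illustrative rather than the module's API.

import torch


def has_2_4_sparsity(t: torch.Tensor) -> bool:
    # 2:4 semi-structured pattern: every contiguous group of 4 values
    # must contain at least 2 zeros.
    zeros_per_group = (t.reshape(-1, 4) == 0).sum(dim=1)
    return bool((zeros_per_group >= 2).all())


ok = torch.tensor([0.0, 1.0, 0.0, 2.0, 3.0, 0.0, 0.0, 4.0])
bad = torch.tensor([1.0, 2.0, 3.0, 0.0, 0.0, 0.0, 5.0, 6.0])
print(has_2_4_sparsity(ok))   # True: both groups of 4 hold >= 2 zeros
print(has_2_4_sparsity(bad))  # False: the first group holds only 1 zero

# The unstructured path instead checks overall sparsity against a threshold:
sparsity = 1.0 - torch.count_nonzero(bad) / bad.numel()
print(float(sparsity))        # 0.375 -> below 0.4, so compression is skipped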
+"""Compare the logprobs of two sequences generated by different models, +which should be similar but not necessarily equal. """ @@ -15,14 +15,18 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match ... + # If generated tokens don't match, then if output_id_0 != output_id_1: - # ... each predicted token must be in top N logprobs of the other's + # Each predicted token must be in top N logprobs of the other assert output_id_0 in logprobs_1[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}" ) assert output_id_1 in logprobs_0[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}" ) # Break out since sequences will now diverge. diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index fed9dfb35e881..c6fce5ef8ae7d 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -1,4 +1,4 @@ -"""Compare the outputs of a sparse model running sparse vs sparse model running dense. +"""Compare the outputs of a sparse model vs sparse model running dense. Note: sparse kernels do not have bitwise correctness vs the dense models. As a result, in this test, we just confirm that the top selected tokens of the sparse models are in the top N selections of same model running dense. @@ -41,7 +41,7 @@ def test_models( sparse_outputs = sparse_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker del sparse_model gc.collect() @@ -53,7 +53,7 @@ def test_models( dense_outputs = dense_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del dense_model.model.llm_engine.driver_worker del dense_model gc.collect() diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index 1abb9269dc15e..c331e7132272d 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -36,9 +36,11 @@ def test_models( sparsity=None, dtype=dtype, max_model_len=1024) - dense_num_kv_blocks = dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + dense_num_kv_blocks = ( + dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + ) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del dense_model.model.llm_engine.driver_worker del dense_model torch.cuda.empty_cache() @@ -48,9 +50,11 @@ def test_models( sparsity=sparsity, dtype=dtype, max_model_len=1024) - sparse_num_kv_blocks = sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + sparse_num_kv_blocks = ( + sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + ) - # Note: deleting just the model does not always free the GPU memory, not sure why. 
+ # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker del sparse_model torch.cuda.empty_cache() diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 1dca24ffa9a53..35dfd7c19d8df 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -20,7 +20,9 @@ import gc from compare_utils import check_logprobs_close from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY +) MAX_MODEL_LEN = 1024 From 5ba2ee147fa014b31fa6a78692a05fdd5e046c62 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:29:59 +0000 Subject: [PATCH 106/113] ruff again --- vllm/model_executor/layers/sparsity/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 204281924e9ad..24292d849b905 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -9,8 +9,8 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( - SemiStructuredSparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 + SemiStructuredSparseW16A16Config ) _SPARSITY_CONFIG_REGISTRY = { From d342426135b1da66561cb64065617d946b9d337f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:39:16 +0000 Subject: [PATCH 107/113] yapf --- tests/models/compare_utils.py | 6 ++---- tests/models/test_compressed_memory.py | 10 ++++------ tests/models/test_marlin.py | 3 +-- vllm/model_executor/layers/parameters/__init__.py | 3 +-- .../layers/parameters/lazy_compressed.py | 12 ++++++------ vllm/model_executor/layers/sparsity/__init__.py | 9 ++++----- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index 235ccad0549a7..44319b6ca45ff 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -21,13 +21,11 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): assert output_id_0 in logprobs_1[idx], ( f"Test{prompt_idx}:" f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}" - ) + f"\n{name_1}:\t{output_str_1!r}") assert output_id_1 in logprobs_0[idx], ( f"Test{prompt_idx}:" f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}" - ) + f"\n{name_1}:\t{output_str_1!r}") # Break out since sequences will now diverge. break diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index c331e7132272d..056452b77e020 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -36,9 +36,8 @@ def test_models( sparsity=None, dtype=dtype, max_model_len=1024) - dense_num_kv_blocks = ( - dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks - ) + dense_num_kv_blocks = (dense_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) # Deleting just the model does not always free the GPU memory. 
del dense_model.model.llm_engine.driver_worker @@ -50,9 +49,8 @@ def test_models( sparsity=sparsity, dtype=dtype, max_model_len=1024) - sparse_num_kv_blocks = ( - sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks - ) + sparse_num_kv_blocks = (sparse_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 35dfd7c19d8df..e524b785af389 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -21,8 +21,7 @@ from compare_utils import check_logprobs_close from dataclasses import dataclass from vllm.model_executor.layers.quantization import ( - _QUANTIZATION_CONFIG_REGISTRY -) + _QUANTIZATION_CONFIG_REGISTRY) MAX_MODEL_LEN = 1024 diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index d05d73a79c13e..6cb53db01d3f6 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,6 +1,5 @@ from vllm.model_executor.layers.parameters.lazy_compressed import ( - LazyCompressedParameter -) + LazyCompressedParameter) __all__ = [ "LazyCompressedParameter", diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 65d44167c004a..05d6bfb27008f 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -66,8 +66,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs): def unwrap(e): nonlocal ret_storage_format_cls if isinstance(e, LazyCompressedParameter): - assert (ret_storage_format_cls is None or - ret_storage_format_cls == e.storage_format_cls) + assert (ret_storage_format_cls is None + or ret_storage_format_cls == e.storage_format_cls) ret_storage_format_cls = e.storage_format_cls if e.is_empty: @@ -100,9 +100,9 @@ def compress(self) -> None: from magic_wand import SparseSemiStructuredStorageFormat if self.storage_format_cls == SparseSemiStructuredStorageFormat: - # Semi-structured sparsity assumes a 2:4 pattern, where - # each 4 elements have at minimum 2 zeros. We need to validate - # this pattern exists, so we check the whole tensor + # Semi-structured sparsity assumes a 2:4 pattern, where + # each 4 elements have at minimum 2 zeros. We need to validate + # this pattern exists, so we check the whole tensor # before committing to compression. # Count zeros in each group of 4 @@ -116,7 +116,7 @@ def compress(self) -> None: if not has_semi_structured_sparsity: logger.warning( f"Called compress() on tensor of shape {self.shape} but " - "does not have 2:4 sparsity, skipping compression") + "does not have 2:4 sparsity, skipping compression") return else: diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 24292d849b905..df2ca0f1b773f 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -7,11 +7,10 @@ "magic_wand is not available and required for sparsity " "support. 
Please install it with `pip install nm-magic-wand`") -from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 -from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 - SemiStructuredSparseW16A16Config -) +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 + SemiStructuredSparseW16A16Config) _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, From e2835280cc5415b80d7597e1c5ffb681bf3c790d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:08:02 +0000 Subject: [PATCH 108/113] finalized ruff --- benchmarks/backend_request_func.py | 2 + benchmarks/benchmark_prefix_caching.py | 3 + benchmarks/benchmark_serving.py | 3 + collect_env.py | 4 ++ csrc/punica/bgmv/generator.py | 2 +- examples/multilora_inference.py | 2 + examples/offline_inference_with_prefix.py | 3 + neuralmagic/benchmarks/common.py | 8 +-- .../benchmarks/run_benchmark_serving.py | 33 ++++++--- .../benchmarks/run_benchmark_throughput.py | 10 ++- .../scripts/backend_request_func.py | 5 +- .../benchmarks/scripts/benchmark_serving.py | 67 ++++++++++--------- .../scripts/benchmark_throughput.py | 8 ++- neuralmagic/benchmarks/scripts/common.py | 13 ++-- .../benchmarks/scripts/datasets_registry.py | 4 +- .../scripts/logging/benchmark_result.py | 10 +-- .../scripts/logging/gha_benchmark_logging.py | 50 +++++++------- neuralmagic/tools/call_cmd.py | 11 +-- setup.py | 2 + 19 files changed, 144 insertions(+), 96 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 26d2c24d5655c..8782f5546b21e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation # This file has been modified by Neural Magic import json diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index a0307439cd5f1..5867e3b171919 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + import argparse import time diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3f5e2d9c8f4dc..040e96458a14b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + """Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/collect_env.py b/collect_env.py index a886db693e2f1..3c914795222ee 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,3 +1,7 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff. +# This file has been modified by Neural Magic + # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py # Unlike the rest of the PyTorch this file must be python2 compliant. 
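The sparsity package above fails fast when its optional backend is missing. A generic, runnable sketch of that guard pattern follows; the find_spec check and helper name are assumptions, only the error text is taken from the module.

import importlib.util


def require_package(name: str, install_hint: str) -> None:
    # Raise an actionable error when an optional backend is not installed.
    if importlib.util.find_spec(name) is None:
        raise ValueError(
            f"{name} is not available and required for sparsity support. "
            f"Please install it with `{install_hint}`")


# Example (left commented out so the sketch never raises on machines
# without the package):
# require_package("magic_wand", "pip install nm-magic-wand")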
diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py index 66de56d74f3e7..a92c67180372a 100644 --- a/csrc/punica/bgmv/generator.py +++ b/csrc/punica/bgmv/generator.py @@ -10,7 +10,7 @@ #include "bgmv_impl.cuh" FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() +""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) for input_dtype in DTYPES: for output_dtype in DTYPES: diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index cd4451481ca83..7b1d580a9a7f6 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation """ This example shows how to use the multi-LoRA functionality for offline inference. diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 1aa718b88907c..2c6c6aa63944d 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + from vllm import LLM, SamplingParams prefix = ( diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 398f8973cc8d2..089a347e49194 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -27,8 +27,8 @@ def max_model_length_from_model_id(model: str, def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: - #config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs - + # config is a NamedTuple constructed from some JSON + # in neuralmagic/benchmarks/configs kv = vars(config.script_args) keys = kv.keys() @@ -57,8 +57,8 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]: """ - Give a path to a config file in `neuralmagic/benchmarks/configs/*` return an Iterable of - (sub)configs in the file + Give a path to a config file in `neuralmagic/benchmarks/configs/*` + return an Iterable of (sub)configs in the file """ assert config_file_path.exists() diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index 110d47e354e24..a9eccb3666d20 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -8,7 +8,10 @@ from typing import NamedTuple, Optional from pathlib import Path -from .common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs +from .common import ( + download_model, max_model_length_from_model_id, + script_args_to_cla, benchmark_configs +) from .scripts.common import warmup_server, num_available_gpus from ..tools.call_cmd import call_cmd @@ -56,18 +59,25 @@ def try_connection() -> bool: return False -def run_benchmark_serving_script(config: NamedTuple, - output_directory: Optional[Path] = None - ) -> None: +def run_benchmark_serving_script( + config: NamedTuple, + output_directory: Optional[Path] = None +) -> None: assert config.script_name == 'benchmark_serving' - def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: + def run_bench( + server_cmd: str, + bench_cmd: list[str], + model: str + ) -> None: try: # start server - server_process = subprocess.Popen("exec " + server_cmd, shell=True) + server_process = subprocess.Popen( + "exec " + server_cmd, shell=True) if not 
is_server_running(BENCH_SERVER_HOST, BENCH_SERVER_PORT): raise ValueError( - f"Aborting bench run with : server-cmd {server_cmd} , bench-cmd {bench_cmd}. Reason: Cannot start Server" + f"Aborting bench run with : server-cmd {server_cmd} , " + f"bench-cmd {bench_cmd}. Reason: Cannot start Server" ) # server warmup @@ -96,13 +106,15 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with the + # maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" + f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}" ) for max_model_len in max_model_lens: @@ -120,7 +132,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: server_args["sparsity"] = sparsity server_cmd = "python3 -m vllm.entrypoints.api_server " + \ - " ".join([f"--{k} {v}" for k, v in server_args.items()]) + " ".join([f"--{k} {v}" + for k, v in server_args.items()]) for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py index d6a505df71559..a28c1a9b73ea4 100644 --- a/neuralmagic/benchmarks/run_benchmark_throughput.py +++ b/neuralmagic/benchmarks/run_benchmark_throughput.py @@ -3,7 +3,9 @@ from pathlib import Path from typing import NamedTuple, Optional -from .common import script_args_to_cla, benchmark_configs, max_model_length_from_model_id +from .common import ( + script_args_to_cla, benchmark_configs, max_model_length_from_model_id +) from ..tools.call_cmd import call_cmd @@ -19,13 +21,15 @@ def run_benchmark_throughput_script(config: NamedTuple, supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with + # the maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" + f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}" ) for max_model_len in max_model_lens: diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py index 078cfd1c6a7fc..dc3855f54418e 100644 --- a/neuralmagic/benchmarks/scripts/backend_request_func.py +++ b/neuralmagic/benchmarks/scripts/backend_request_func.py @@ -135,7 +135,7 @@ async def async_request_vllm( data = part_data output.latency = time.perf_counter() - st - # When streaming, '\0' is appended to the end of the response. + # When streaming, '\0' is appended to the end. 
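is_server_running() is called in run_bench() above but its definition is outside this hunk; the following is a minimal sketch of the kind of readiness poll it is assumed to perform, with illustrative host, port, and timeout values.

import socket
import time


def wait_for_port(host: str, port: int,
                  timeout_s: float = 300.0, poll_s: float = 2.0) -> bool:
    # Poll until a TCP connection to host:port succeeds or the timeout expires.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=poll_s):
                return True
        except OSError:
            time.sleep(poll_s)
    return False


# Example: block until a locally launched api_server accepts connections.
# wait_for_port("localhost", 8000)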
body = trim_suffix(data.decode('utf-8'), "\0") output.generated_text = json.loads( body)["text"][0][len(request_func_input.prompt):] @@ -220,7 +220,8 @@ async def async_request_deepspeed_mii( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len - # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder. + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. # https://github.com/microsoft/DeepSpeed-MII/pull/311 output.ttft = 0 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index f0c1d8d9951fc..5e0ca7d52aa43 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -238,18 +238,14 @@ async def benchmark(backend: str, api_url: str, model_id: str, print(f"Benchmark duration: {metrics.metadata.duration:2f} s") print(f"Total input tokens: {metrics.metadata.total_input}") print(f"Total generated tokens: {metrics.metadata.total_output}") - print( - f"Request throughput: {metrics.metrics.request_throughput:.2f} requests/s" - ) - print( - f"Input token throughput: {metrics.metrics.input_throughput:.2f} tokens/s" - ) - print( - f"Output token throughput: {metrics.metrics.output_throughput:.2f} tokens/s" - ) - print( - f"Median request latency: {metrics.metrics.median_request_latency:.2f} ms" - ) + print(f"Request throughput: " + f"{metrics.metrics.request_throughput:.2f} requests/s") + print(f"Input token throughput: " + f"{metrics.metrics.input_throughput:.2f} tokens/s") + print(f"Output token throughput: " + f"{metrics.metrics.output_throughput:.2f} tokens/s") + print(f"Median request latency: " + f"{metrics.metrics.median_request_latency:.2f} ms") print(f"P90 request latency: {metrics.metrics.p90_request_latency:.2f} ms") print(f"P99 request latency: {metrics.metrics.p99_request_latency:.2f} ms") print(f"Mean TTFT: {metrics.metrics.mean_ttft_ms:.2f} ms") @@ -349,9 +345,10 @@ def script_args_as_json_dict(script_args: argparse.Namespace): result = metrics.update_benchmark_result(result) # Add information about the derived variables as metadata - result[BenchmarkResult.METADATA_KEY_][ + metadata_key = BenchmarkResult.METADATA_KEY_ + result[metadata_key][ ResultMetadataKeys.num_prompts] = num_prompts - result[BenchmarkResult.METADATA_KEY_][ResultMetadataKeys.request_rate] = \ + result[metadata_key][ResultMetadataKeys.request_rate] = \ request_rate if request_rate < float("inf") else "inf" # Save to file @@ -388,7 +385,8 @@ def from_str(arg: str): type=str, default="benchmark-serving", help= - "Benchmark description. This is primarily useful when we log the benchmark results and process them for plotting charts" + "Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts" ) parser.add_argument( "--backend", @@ -437,8 +435,8 @@ def from_str(arg: str): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default model tokenizer.", + help="Name or path of the tokenizer, " + "if not using the default model tokenizer.", ) parser.add_argument( "--best-of", @@ -482,15 +480,15 @@ def from_str(arg: str): "Otherwise, we use Poisson process to synthesize " "the request arrival times.", ) - parser.add_argument("--nr-qps-pair_", - type=NumPrompts_RequestRate_T.from_str, - help=""" - First argument in the pair is num_prompts: Number of prompts to process. 
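The help text above refers to a Poisson arrival process; concretely, that means exponentially distributed gaps between consecutive requests. A small numpy-based sketch follows (the script's own sampling code is not shown in this hunk).

import numpy as np


def poisson_arrival_times(num_requests: int, request_rate: float,
                          seed: int = 0) -> np.ndarray:
    # request_rate == inf -> every request is sent at t = 0.
    if np.isinf(request_rate):
        return np.zeros(num_requests)
    rng = np.random.default_rng(seed)
    # Exponential inter-arrival gaps with mean 1/request_rate yield a
    # Poisson process at `request_rate` requests per second.
    gaps = rng.exponential(scale=1.0 / request_rate, size=num_requests)
    return np.cumsum(gaps)


print(poisson_arrival_times(5, request_rate=2.0))           # send times in s
print(poisson_arrival_times(3, request_rate=float("inf")))  # [0. 0. 0.]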
- Second argument in the pair is request_rate : Number of requests per second. If this is inf, - then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize - the request arrival times. - """, - default=None) + parser.add_argument( + "--nr-qps-pair_", + type=NumPrompts_RequestRate_T.from_str, + help=""" + First argument in the pair is num_prompts to process. + Second argument in the pair is request_rate per second. + If this is inf, then all the requests are sent at time 0. + Otherwise, we use Poisson process to synthesize""", + default=None) # Server command args parser.add_argument( @@ -498,29 +496,34 @@ def from_str(arg: str): type=int, default=None, help= - "tensor-parallel-size that the benchmarking script was invoked with. It is useful to log this information when storing benchmarking results" + "tensor-parallel-size that the benchmarking script was invoked with. " + "It is useful to log this information when storing benchmarking results" ) parser.add_argument( "--server-args", type=str, default=None, help= - "When we are logging the output, it is useful to log the arguments passed to the server" + "When we are logging the output, it is useful to log the " + "arguments passed to the server" ) def args_sanity_check(args): # Sanity check real-dataset vs synthetic-dataset usecase if args.dataset is None: - assert args.num_input_tokens is not None and args.num_output_tokens is not None + assert (args.num_input_tokens is not None and + args.num_output_tokens is not None) else: - assert args.num_input_tokens is None and args.num_output_tokens is None - # Sanity check num_prompts, request_rate as separate args vs joint args usecase + assert (args.num_input_tokens is None and + args.num_output_tokens is None) + # Sanity check num_prompts, request_rate as separate args vs joint args assert not all([ args.num_prompts_ is None, args.request_rate_ is None, args.nr_qps_pair_ is None ]) if args.nr_qps_pair_ is None: - assert args.num_prompts_ is not None and args.request_rate_ is not None + assert (args.num_prompts_ is not None and + args.request_rate_ is not None) else: assert args.num_prompts_ is None and args.request_rate_ is None # Sanity check required logging args diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index 9138ea0f8ad47..f351a70abdb60 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -12,7 +12,8 @@ from pathlib import Path from typing import List, Optional, Tuple from transformers import AutoTokenizer -from .common import generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs +from .common import (generate_synthetic_requests, warmup_vllm_engine, + num_available_gpus, print_request_outputs) from .datasets_registry import get_dataset, DatasetArgs from .logging.benchmark_result import (BenchmarkResult, BenchmarkThroughputResultMetricTemplates @@ -163,7 +164,7 @@ def main(args: argparse.Namespace): current_dt_str = current_dt.strftime("%Y%m%d-%H%M%S") file_name = Path( args.save_directory - ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" + ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 result.store(file_name) @@ -174,7 +175,8 @@ def main(args: argparse.Namespace): type=str, default="benchmark-throughput", help= - "Benchmark description. 
This is primarily useful when we log the benchmark results and process them for plotting charts" + "Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts" ) parser.add_argument("--backend", type=str, diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index d4addb99a2878..8fbe292d6abc8 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -12,7 +12,8 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR -from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm +from .backend_request_func import ( + RequestFuncInput, RequestFuncOutput, async_request_vllm) from ...tools.call_cmd import call_cmd @@ -23,7 +24,7 @@ def num_available_gpus() -> int: def get_benchmarking_context() -> dict: """ - Return the current python version, pytorch version and CUDA version as a dict + Return the current python, pytorch and CUDA version as a dict """ import sys import torch @@ -100,7 +101,7 @@ def warmup_requests(tokenizer: PreTrainedTokenizerBase, num_input_tokens: int = 128, num_output_tokens: int = 1) -> List[Tuple[str, int, int]]: """ - Given a tokenizer, generate `num_requests` requests that would be used for vllm engine warmup + Given a tokenizer, generate `num_requests` requests used for warmup """ words = list(tokenizer.get_vocab().keys()) requests = [] @@ -187,7 +188,7 @@ async def process_requests(input_requests): def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int, n_output_tokens: int) -> str: - return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" + return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 def print_request_outputs(results: List[RequestOutput]) -> None: @@ -202,8 +203,8 @@ def print_request_outputs(results: List[RequestOutput]) -> None: def print_serving_request_io(inputs: List[Tuple[str, int, int]], outputs: List[RequestFuncOutput]) -> None: """ - inputs: list of tuples where the tuple is [prompt, prompt_length, output_length], - outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py) + inputs: list of tuples of form [prompt, prompt_length, output_length], + outputs: list of RequestFuncOutput output from benchmark_serving.py Format and print the inputs and outputs. 
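get_benchmarking_context() above shows only its docstring in this hunk; the sketch below illustrates the kind of dict it might return (field names are illustrative, not the real keys).

import sys

import torch


def get_benchmarking_context() -> dict:
    # Snapshot of the software environment to store alongside results.
    return {
        "python_version": sys.version,
        "torch_version": str(torch.__version__),
        "torch_cuda_version": torch.version.cuda,
        "cuda_device_names": [
            torch.cuda.get_device_name(i)
            for i in range(torch.cuda.device_count())
        ],
    }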
""" for i, o in zip(inputs, outputs): diff --git a/neuralmagic/benchmarks/scripts/datasets_registry.py b/neuralmagic/benchmarks/scripts/datasets_registry.py index b710c712d24cb..c1c4d02e725a0 100644 --- a/neuralmagic/benchmarks/scripts/datasets_registry.py +++ b/neuralmagic/benchmarks/scripts/datasets_registry.py @@ -63,8 +63,8 @@ def get_ultrachat(tokenizer: PreTrainedTokenizerBase, prompts = [] completions = [] system_message = { - "content": - "You are a chatbot with the explicit goal of helping the user as best as possible", + "content": "You are a chatbot with the explicit goal of " + "helping the user as best as possible", "role": "system", } for messages in ds["messages"]: diff --git a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py index a997cbb855698..37b9c49aa9fd4 100644 --- a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py +++ b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py @@ -1,5 +1,5 @@ """ -Defines a BenchmarkResult class that all the benchmarks use store the benchmark results. +Defines a BenchmarkResult class that all the benchmarks use to save results. """ import json @@ -16,9 +16,9 @@ # NOTE - PLEASE READ: # Any modifications that adds/removes the keys in the JSON that BenchmarkResult # produces should also update the BENCHMARK_RESULTS_SCHEMA_VERSION. -# The primary use case is to establish a set of keys that can be queried against reliably. -# TODO (varun) : Initial version is named 0.0.0 as things are under development. Update it -# when things are stable. +# The primary use case is to establish a set of keys that can be queried. +# TODO (varun) : Initial version is named 0.0.0 as things are under development. +# Update it when things are stable. BENCHMARK_RESULTS_SCHEMA_VERSION = "0.0.0" @@ -158,7 +158,7 @@ def __init__(self, description: str, date: datetime, script_name: str, dataset if dataset is not None else "synthetic", self.SCRIPT_ARGS_KEY_: script_args, - # Any metadata that the caller script wants to store should be stored here. + # Any metadata that the caller script wants to store. self.METADATA_KEY_: {}, # Any benchmarking metrics should be stored here. self.METRICS_KEY_: {} diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py index a7564417ba702..116eba43f13d2 100644 --- a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py +++ b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py @@ -10,7 +10,9 @@ from dataclasses import dataclass from typing import List, Iterable, NamedTuple -from .benchmark_result import GHABenchmarkToolName, BenchmarkResult, MetricTemplate +from .benchmark_result import ( + GHABenchmarkToolName, BenchmarkResult, MetricTemplate +) @dataclass @@ -123,29 +125,29 @@ def dump_to_json(gha_records: List[GHARecord], output_path: Path): Reference : https://github.com/benchmark-action/github-action-benchmark """) - parser.add_argument("-i", - "--input-json-directory", - required=True, - type=str, - help=""" - Path to the directory containing BenchmarkResult jsons. - This is typically the output directory passed to the benchmark - runner scripts like neuralmagic/benchmarks/run_benchmarks.py. - """) - - parser.add_argument("--bigger-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName BiggerIsBetter metrics are to be stored. 
- """) - - parser.add_argument("--smaller-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName SmallerIsBetter metrics are to be stored - """) + parser.add_argument( + "-i", + "--input-json-directory", + required=True, + type=str, + help="""Path to the directory containing BenchmarkResult + jsons. This is typically the output directory passed + to the benchmark runner scripts like + neuralmagic/benchmarks/run_benchmarks.py.""") + + parser.add_argument( + "--bigger-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + BiggerIsBetter metrics are to be stored.""") + + parser.add_argument( + "--smaller-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + SmallerIsBetter metrics are to be stored""") args = parser.parse_args() diff --git a/neuralmagic/tools/call_cmd.py b/neuralmagic/tools/call_cmd.py index 2ff84a0c02a5f..2e3f06c267fd3 100644 --- a/neuralmagic/tools/call_cmd.py +++ b/neuralmagic/tools/call_cmd.py @@ -1,6 +1,9 @@ # -# Run cmd as a sub-process. Capture stdout, stderr, return status, elapsed time and -# optionally process statistics (user time, system time, peak memory usage, etc.) +# Run cmd as a sub-process. +# +# Capture stdout, stderr, return status, elapsed time and +# optionally process statistics +# (user time, system time, peak memory usage, etc.) # import os import re @@ -12,8 +15,8 @@ def parse_process_stats(str): exp = ( - "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " - "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" + "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 + "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" # noqa: E501 ) results = re.search(exp, str) if results: diff --git a/setup.py b/setup.py index f76447c0e7424..6c1b4a91134d0 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff. # This file has been modified by Neural Magic import contextlib From c5633f2a2b540b1f29ae37eeb7b7377e9aa4045e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:12:46 +0000 Subject: [PATCH 109/113] yapf after ruff :) --- benchmarks/benchmark_serving.py | 1 - csrc/punica/bgmv/generator.py | 2 +- neuralmagic/benchmarks/common.py | 2 +- .../benchmarks/run_benchmark_serving.py | 35 ++++++----------- .../benchmarks/run_benchmark_throughput.py | 13 +++---- .../scripts/backend_request_func.py | 2 +- .../benchmarks/scripts/benchmark_serving.py | 38 ++++++++----------- .../scripts/benchmark_throughput.py | 10 ++--- neuralmagic/benchmarks/scripts/common.py | 6 +-- .../benchmarks/scripts/datasets_registry.py | 4 +- .../scripts/logging/gha_benchmark_logging.py | 5 +-- neuralmagic/tools/call_cmd.py | 6 +-- 12 files changed, 50 insertions(+), 74 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 040e96458a14b..7699304769653 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,6 +1,5 @@ # flake8: noqa # UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation - """Benchmark online serving throughput. 
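As a usage illustration of parse_process_stats() above, here is a sample stats line reconstructed from its regular expression; the "[Timing] run:" label is a guess, only the field layout is taken from the pattern.

import re

EXP = (r"\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) "
       r"maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)")

sample = ("[Timing] run: elapsed=12.3 user=10.1 system=1.2 "
          "maxrss=2048.0 avgrss=1024.0 avgmem=512.0 avgdata=256.0")

match = re.search(EXP, sample)
if match:
    elapsed, user, system, maxrss, avgrss, avgmem, avgdata = map(
        float, match.groups())
    print(elapsed, maxrss)  # 12.3 2048.0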
On the server side, run one of the following commands: diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py index a92c67180372a..7ceaf9e6892a5 100644 --- a/csrc/punica/bgmv/generator.py +++ b/csrc/punica/bgmv/generator.py @@ -10,7 +10,7 @@ #include "bgmv_impl.cuh" FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) +""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) for input_dtype in DTYPES: for output_dtype in DTYPES: diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 089a347e49194..b0fa4fbe45187 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -27,7 +27,7 @@ def max_model_length_from_model_id(model: str, def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: - # config is a NamedTuple constructed from some JSON + # config is a NamedTuple constructed from some JSON # in neuralmagic/benchmarks/configs kv = vars(config.script_args) diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index a9eccb3666d20..0c10219501ea1 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -8,10 +8,8 @@ from typing import NamedTuple, Optional from pathlib import Path -from .common import ( - download_model, max_model_length_from_model_id, - script_args_to_cla, benchmark_configs -) +from .common import (download_model, max_model_length_from_model_id, + script_args_to_cla, benchmark_configs) from .scripts.common import warmup_server, num_available_gpus from ..tools.call_cmd import call_cmd @@ -59,26 +57,19 @@ def try_connection() -> bool: return False -def run_benchmark_serving_script( - config: NamedTuple, - output_directory: Optional[Path] = None -) -> None: +def run_benchmark_serving_script(config: NamedTuple, + output_directory: Optional[Path] = None + ) -> None: assert config.script_name == 'benchmark_serving' - def run_bench( - server_cmd: str, - bench_cmd: list[str], - model: str - ) -> None: + def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: try: # start server - server_process = subprocess.Popen( - "exec " + server_cmd, shell=True) + server_process = subprocess.Popen("exec " + server_cmd, shell=True) if not is_server_running(BENCH_SERVER_HOST, BENCH_SERVER_PORT): raise ValueError( f"Aborting bench run with : server-cmd {server_cmd} , " - f"bench-cmd {bench_cmd}. Reason: Cannot start Server" - ) + f"bench-cmd {bench_cmd}. Reason: Cannot start Server") # server warmup warmup_server(server_host=BENCH_SERVER_HOST, @@ -106,16 +97,14 @@ def run_bench( supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the + # If the requested model-len is too big, try running with the # maximum supported for this model. 
max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} " - f"from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: @@ -132,7 +121,7 @@ def run_bench( server_args["sparsity"] = sparsity server_cmd = "python3 -m vllm.entrypoints.api_server " + \ - " ".join([f"--{k} {v}" + " ".join([f"--{k} {v}" for k, v in server_args.items()]) for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py index a28c1a9b73ea4..debb98f8a3279 100644 --- a/neuralmagic/benchmarks/run_benchmark_throughput.py +++ b/neuralmagic/benchmarks/run_benchmark_throughput.py @@ -3,9 +3,8 @@ from pathlib import Path from typing import NamedTuple, Optional -from .common import ( - script_args_to_cla, benchmark_configs, max_model_length_from_model_id -) +from .common import (script_args_to_cla, benchmark_configs, + max_model_length_from_model_id) from ..tools.call_cmd import call_cmd @@ -21,16 +20,14 @@ def run_benchmark_throughput_script(config: NamedTuple, supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with + # If the requested model-len is too big, try running with # the maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} " - f"from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py index dc3855f54418e..b5e0308848e25 100644 --- a/neuralmagic/benchmarks/scripts/backend_request_func.py +++ b/neuralmagic/benchmarks/scripts/backend_request_func.py @@ -220,7 +220,7 @@ async def async_request_deepspeed_mii( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len - # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, # will use 0 as placeholder. # https://github.com/microsoft/DeepSpeed-MII/pull/311 output.ttft = 0 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index 5e0ca7d52aa43..6dc32e9d552ea 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -346,8 +346,7 @@ def script_args_as_json_dict(script_args: argparse.Namespace): # Add information about the derived variables as metadata metadata_key = BenchmarkResult.METADATA_KEY_ - result[metadata_key][ - ResultMetadataKeys.num_prompts] = num_prompts + result[metadata_key][ResultMetadataKeys.num_prompts] = num_prompts result[metadata_key][ResultMetadataKeys.request_rate] = \ request_rate if request_rate < float("inf") else "inf" @@ -384,10 +383,8 @@ def from_str(arg: str): "--description", type=str, default="benchmark-serving", - help= - "Benchmark description. 
This is primarily useful when " - "we log the benchmark results and process them for plotting charts" - ) + help="Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument( "--backend", type=str, @@ -436,7 +433,7 @@ def from_str(arg: str): "--tokenizer", type=str, help="Name or path of the tokenizer, " - "if not using the default model tokenizer.", + "if not using the default model tokenizer.", ) parser.add_argument( "--best-of", @@ -480,15 +477,14 @@ def from_str(arg: str): "Otherwise, we use Poisson process to synthesize " "the request arrival times.", ) - parser.add_argument( - "--nr-qps-pair_", - type=NumPrompts_RequestRate_T.from_str, - help=""" + parser.add_argument("--nr-qps-pair_", + type=NumPrompts_RequestRate_T.from_str, + help=""" First argument in the pair is num_prompts to process. Second argument in the pair is request_rate per second. If this is inf, then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize""", - default=None) + default=None) # Server command args parser.add_argument( @@ -503,27 +499,25 @@ def from_str(arg: str): "--server-args", type=str, default=None, - help= - "When we are logging the output, it is useful to log the " - "arguments passed to the server" - ) + help="When we are logging the output, it is useful to log the " + "arguments passed to the server") def args_sanity_check(args): # Sanity check real-dataset vs synthetic-dataset usecase if args.dataset is None: - assert (args.num_input_tokens is not None and - args.num_output_tokens is not None) + assert (args.num_input_tokens is not None + and args.num_output_tokens is not None) else: - assert (args.num_input_tokens is None and - args.num_output_tokens is None) + assert (args.num_input_tokens is None + and args.num_output_tokens is None) # Sanity check num_prompts, request_rate as separate args vs joint args assert not all([ args.num_prompts_ is None, args.request_rate_ is None, args.nr_qps_pair_ is None ]) if args.nr_qps_pair_ is None: - assert (args.num_prompts_ is not None and - args.request_rate_ is not None) + assert (args.num_prompts_ is not None + and args.request_rate_ is not None) else: assert args.num_prompts_ is None and args.request_rate_ is None # Sanity check required logging args diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index f351a70abdb60..ba586772d5d09 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import List, Optional, Tuple from transformers import AutoTokenizer -from .common import (generate_synthetic_requests, warmup_vllm_engine, +from .common import (generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs) from .datasets_registry import get_dataset, DatasetArgs from .logging.benchmark_result import (BenchmarkResult, @@ -164,7 +164,7 @@ def main(args: argparse.Namespace): current_dt_str = current_dt.strftime("%Y%m%d-%H%M%S") file_name = Path( args.save_directory - ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 + ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 result.store(file_name) @@ -174,10 +174,8 @@ def main(args: argparse.Namespace): "--description", type=str, default="benchmark-throughput", - help= - "Benchmark 
description. This is primarily useful when " - "we log the benchmark results and process them for plotting charts" - ) + help="Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument("--backend", type=str, choices=["vllm"], diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index 8fbe292d6abc8..9333939300e92 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -12,8 +12,8 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR -from .backend_request_func import ( - RequestFuncInput, RequestFuncOutput, async_request_vllm) +from .backend_request_func import (RequestFuncInput, RequestFuncOutput, + async_request_vllm) from ...tools.call_cmd import call_cmd @@ -188,7 +188,7 @@ async def process_requests(input_requests): def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int, n_output_tokens: int) -> str: - return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 + return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 def print_request_outputs(results: List[RequestOutput]) -> None: diff --git a/neuralmagic/benchmarks/scripts/datasets_registry.py b/neuralmagic/benchmarks/scripts/datasets_registry.py index c1c4d02e725a0..919abb72ee39b 100644 --- a/neuralmagic/benchmarks/scripts/datasets_registry.py +++ b/neuralmagic/benchmarks/scripts/datasets_registry.py @@ -63,8 +63,8 @@ def get_ultrachat(tokenizer: PreTrainedTokenizerBase, prompts = [] completions = [] system_message = { - "content": "You are a chatbot with the explicit goal of " - "helping the user as best as possible", + "content": "You are a chatbot with the explicit goal of " + "helping the user as best as possible", "role": "system", } for messages in ds["messages"]: diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py index 116eba43f13d2..a89820da7dae9 100644 --- a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py +++ b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py @@ -10,9 +10,8 @@ from dataclasses import dataclass from typing import List, Iterable, NamedTuple -from .benchmark_result import ( - GHABenchmarkToolName, BenchmarkResult, MetricTemplate -) +from .benchmark_result import (GHABenchmarkToolName, BenchmarkResult, + MetricTemplate) @dataclass diff --git a/neuralmagic/tools/call_cmd.py b/neuralmagic/tools/call_cmd.py index 2e3f06c267fd3..1168ab5043bfd 100644 --- a/neuralmagic/tools/call_cmd.py +++ b/neuralmagic/tools/call_cmd.py @@ -1,8 +1,8 @@ # -# Run cmd as a sub-process. +# Run cmd as a sub-process. # # Capture stdout, stderr, return status, elapsed time and -# optionally process statistics +# optionally process statistics # (user time, system time, peak memory usage, etc.) 
# import os @@ -15,7 +15,7 @@ def parse_process_stats(str): exp = ( - "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 + "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" # noqa: E501 ) results = re.search(exp, str) From 1271e3c735b8bdf952d2c89ca1e6ec61e53ed052 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:54:54 +0000 Subject: [PATCH 110/113] yapf after ruff :) --- tests/models/test_marlin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index e524b785af389..34bc6d0e77f61 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -48,6 +48,7 @@ class ModelPair: @pytest.mark.flaky(reruns=2) +@pytest.mark.skip(reason="OOM Again in Automation") @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) From c47bd6b71096e07ecba05936f61908f8212e41f9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 02:09:18 +0000 Subject: [PATCH 111/113] fixed tests post update --- tests/models/test_compressed.py | 4 ---- tests/models/test_compressed_memory.py | 4 ---- tests/models/test_marlin.py | 8 -------- 3 files changed, 16 deletions(-) diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index c6fce5ef8ae7d..aa885661a0af3 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -41,8 +41,6 @@ def test_models( sparse_outputs = sparse_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Deleting just the model does not always free the GPU memory. - del sparse_model.model.llm_engine.driver_worker del sparse_model gc.collect() @@ -53,8 +51,6 @@ def test_models( dense_outputs = dense_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Deleting just the model does not always free the GPU memory. - del dense_model.model.llm_engine.driver_worker del dense_model gc.collect() diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index 056452b77e020..fddebd58104e3 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -39,8 +39,6 @@ def test_models( dense_num_kv_blocks = (dense_model.model.llm_engine.scheduler. block_manager.gpu_allocator.num_blocks) - # Deleting just the model does not always free the GPU memory. - del dense_model.model.llm_engine.driver_worker del dense_model torch.cuda.empty_cache() gc.collect() @@ -52,8 +50,6 @@ def test_models( sparse_num_kv_blocks = (sparse_model.model.llm_engine.scheduler. block_manager.gpu_allocator.num_blocks) - # Deleting just the model does not always free the GPU memory. 
- del sparse_model.model.llm_engine.driver_worker del sparse_model torch.cuda.empty_cache() gc.collect() diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 34bc6d0e77f61..7c0382dfa7b34 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -48,7 +48,6 @@ class ModelPair: @pytest.mark.flaky(reruns=2) -@pytest.mark.skip(reason="OOM Again in Automation") @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @@ -69,11 +68,7 @@ def test_models( marlin_outputs = marlin_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del marlin_model.model.llm_engine.driver_worker del marlin_model - gc.collect() torch.cuda.empty_cache() @@ -84,9 +79,6 @@ def test_models( max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del gptq_model.model.llm_engine.driver_worker del gptq_model gc.collect() torch.cuda.empty_cache() From b9c3578f1dbb72979a3cf238b07509bc5fb8fd42 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 02:25:01 +0000 Subject: [PATCH 112/113] missed one test --- tests/models/test_models_logprobs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 80cbf2a48efc4..8878510bd0a93 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -51,7 +51,6 @@ def test_models( max_tokens, num_logprobs) - del vllm_model.model.llm_engine.driver_worker del vllm_model # loop through the prompts From 1e36b51af1b9fda33e1bdf09ce246704571bcef3 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 15 Mar 2024 11:56:26 -0400 Subject: [PATCH 113/113] Update test-pipeline.yaml removed duplicated test in buildkite (bad merge) --- .buildkite/test-pipeline.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fa3487c9f12cd..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,9 +17,6 @@ steps: - label: Core Test command: pytest -v -s core -- label: Core Test - command: pytest -v -s core - - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py working_dir: "/vllm-workspace/tests/distributed"