From d7f396486e3e9b4dd31020c81c6eb446593b586d Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Thu, 22 Feb 2024 04:18:37 +0200 Subject: [PATCH 001/113] Update comment (#2934) --- benchmarks/benchmark_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index cdcfb8582143c..ff5609c37febf 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -7,7 +7,7 @@ --disable-log-requests (TGI backend) - ./launch_hf_server.sh + ./launch_tgi_server.sh On the client side, run: python benchmarks/benchmark_serving.py \ From 5574081c49c9a5ac51662981aff80250119a97bd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 21 Feb 2024 21:24:01 -0500 Subject: [PATCH 002/113] Added early stopping to completion APIs (#2939) --- vllm/entrypoints/openai/protocol.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 727fec870293c..7c2aa707775ff 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -72,6 +72,7 @@ class ChatCompletionRequest(BaseModel): top_k: Optional[int] = -1 ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True @@ -99,6 +100,7 @@ def to_sampling_params(self) -> SamplingParams: top_k=self.top_k, ignore_eos=self.ignore_eos, use_beam_search=self.use_beam_search, + early_stopping=self.early_stopping, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, @@ -129,6 +131,7 @@ class CompletionRequest(BaseModel): top_k: Optional[int] = -1 ignore_eos: Optional[bool] = False use_beam_search: Optional[bool] = False + early_stopping: Optional[bool] = False stop_token_ids: Optional[List[int]] = Field(default_factory=list) skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True @@ -157,6 +160,7 @@ def to_sampling_params(self): max_tokens=self.max_tokens if not echo_without_generation else 1, logprobs=self.logprobs, use_beam_search=self.use_beam_search, + early_stopping=self.early_stopping, prompt_logprobs=self.logprobs if self.echo else None, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=(self.spaces_between_special_tokens), From 344020c926ad19d9d147f5ab6b8929669296edcb Mon Sep 17 00:00:00 2001 From: Roy Date: Thu, 22 Feb 2024 10:25:05 +0800 Subject: [PATCH 003/113] Migrate MistralForCausalLM to LlamaForCausalLM (#2868) --- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/mistral.py | 377 ------------------------- 3 files changed, 6 insertions(+), 379 deletions(-) delete mode 100644 vllm/model_executor/models/mistral.py diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 17d8d69ba8672..411814f2f5d09 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -30,7 +30,7 @@ "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), - "MistralForCausalLM": ("mistral", "MistralForCausalLM"), + "MistralForCausalLM": ("llama", 
"LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1d0353d7d396e..b7f6b8f3ec374 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -92,6 +92,7 @@ def __init__( max_position_embeddings: int = 8192, linear_method: Optional[LinearMethodBase] = None, bias: bool = False, + sliding_window: Optional[int] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -141,7 +142,8 @@ def __init__( self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling, - num_kv_heads=self.num_kv_heads) + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, @@ -172,6 +174,7 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + sliding_window = getattr(config, "sliding_window", None) self.self_attn = LlamaAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -182,6 +185,7 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, bias=getattr(config, "bias", False), + sliding_window=sliding_window, ) self.mlp = LlamaMLP( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py deleted file mode 100644 index 2347ed752d781..0000000000000 --- a/vllm/model_executor/models/mistral.py +++ /dev/null @@ -1,377 +0,0 @@ -# coding=utf-8 -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Inference-only Mistral model compatible with HuggingFace weights.""" -from typing import List, Optional, Tuple - -import torch -from torch import nn -from transformers import MistralConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class MistralMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class MistralAttention(nn.Module): - - def __init__(self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - max_position: int = 4096 * 32, - rope_theta: float = 10000, - linear_method: Optional[LinearMethodBase] = None, - sliding_window: Optional[int] = None) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - if self.total_num_kv_heads >= tp_size: - # Number of KV heads is greater than TP size, so we partition - # the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - else: - # Number of KV heads is less than TP size, so we replicate - # the KV heads across multiple tensor parallel GPUs. 
- assert tp_size % self.total_num_kv_heads == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.sliding_window = sliding_window - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=False, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=False, - linear_method=linear_method, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position, - base=self.rope_theta, - ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class MistralDecoderLayer(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - self.self_attn = MistralAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - max_position=config.max_position_embeddings, - num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - linear_method=linear_method, - sliding_window=config.sliding_window) - self.mlp = MistralMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class MistralModel(nn.Module): - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + 
lora_vocab - self.org_vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.layers = nn.ModuleList([ - MistralDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class MistralForCausalLM(nn.Module): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] - - def __init__( - self, - config: MistralConfig, - linear_method: Optional[LinearMethodBase] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = MistralModel(config, - linear_method, - lora_config=lora_config) - unpadded_vocab_size = config.vocab_size - if lora_config: - unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config else lora_config.lora_vocab_padding_size, - ) - self.sampler = Sampler(unpadded_vocab_size, config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) From 95529e32537287831cddd800280a20d7c2417163 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 18:28:23 -0800 Subject: [PATCH 004/113] Use Llama RMSNorm custom op for Gemma (#2974) --- vllm/model_executor/models/gemma.py | 60 +++++++++++++---------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index affe54c448a2c..03bd149c001d3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -22,6 +22,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -40,21 +41,6 @@ KVCache = Tuple[torch.Tensor, torch.Tensor] -class GemmaRMSNorm(nn.Module): - - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.zeros(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - output = self._norm(x.float()).type_as(x) - return output * (1 + self.weight) - - class GemmaMLP(nn.Module): def __init__( @@ -185,10 +171,10 @@ def __init__( intermediate_size=config.intermediate_size, linear_method=linear_method, ) - self.input_layernorm = GemmaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) def forward( self, @@ -196,25 +182,27 @@ def forward( hidden_states: torch.Tensor, kv_cache: KVCache, input_metadata: InputMetadata, + residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: # Self Attention - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) hidden_states = self.self_attn( positions=positions, hidden_states=hidden_states, kv_cache=kv_cache, input_metadata=input_metadata, ) - hidden_states = residual + hidden_states # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return hidden_states + return hidden_states, residual class GemmaModel(nn.Module): @@ -235,7 +223,7 @@ def __init__( GemmaDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) - self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) def forward( self, @@ 
-246,17 +234,19 @@ def forward( ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) # Normalize the embedding by sqrt(hidden_size) - hidden_states = hidden_states * (self.config.hidden_size**0.5) + hidden_states *= self.config.hidden_size**0.5 + residual = None for i in range(len(self.layers)): layer = self.layers[i] - hidden_states = layer( + hidden_states, residual = layer( positions, hidden_states, kv_caches[i], input_metadata, + residual, ) - hidden_states = self.norm(hidden_states) + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -321,6 +311,10 @@ def load_weights(self, # Skip loading extra layer for lora models. if "lm_head" in name: continue + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + if "norm.weight" in name: + loaded_weight += 1.0 param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -329,5 +323,5 @@ def load_weights(self, unloaded_params = params_dict.keys() - loaded_params if unloaded_params: raise RuntimeError( - f"Some weights are not initialized from checkpoints: {unloaded_params}" - ) + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") From 93dc5a287086299a124e9f1f6fac75458ae0acbd Mon Sep 17 00:00:00 2001 From: Massimiliano Pronesti Date: Thu, 22 Feb 2024 02:56:01 +0000 Subject: [PATCH 005/113] chore(vllm): codespell for spell checking (#2820) --- .github/workflows/ruff.yml | 5 +- benchmarks/benchmark_serving.py | 2 +- format.sh | 51 +++++++++++++++++-- mypy.ini | 8 --- pyproject.toml | 18 +++++++ requirements-dev.txt | 2 + tests/lora/test_layers.py | 2 +- tests/lora/test_llama.py | 4 +- vllm/core/block_manager.py | 2 +- vllm/core/scheduler.py | 2 +- vllm/lora/punica.py | 2 +- .../layers/triton_kernel/prefix_prefill.py | 2 +- vllm/model_executor/models/decilm.py | 2 +- .../parallel_utils/custom_all_reduce.py | 4 +- .../parallel_utils/parallel_state.py | 2 +- vllm/utils.py | 2 +- 16 files changed, 85 insertions(+), 25 deletions(-) delete mode 100644 mypy.ini diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index bd38d11872dc4..8f8f5ee3cc70c 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,7 +25,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 + pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 - name: Analysing the code with ruff run: | ruff vllm tests + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml \ No newline at end of file diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index ff5609c37febf..7d389a9c7d703 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -375,7 +375,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--disable-tqdm", action="store_true", - help="Specify to disbale tqdm progress bar.", + help="Specify to disable tqdm progress bar.", ) parser.add_argument( "--save-result", diff --git a/format.sh b/format.sh index c78108869659d..eb2c5ab031626 100755 --- a/format.sh +++ b/format.sh @@ -24,6 +24,7 @@ builtin cd "$ROOT" || exit 1 YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') +CODESPELL_VERSION=$(codespell --version) # # params: tool name, tool version, required version tool_version_check() { @@ -36,6 +37,7 @@ tool_version_check() { 
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" +tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)" YAPF_FLAGS=( '--recursive' @@ -93,6 +95,47 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +# check spelling of specified files +spell_check() { + codespell "$@" +} + +spell_check_all(){ + codespell --toml pyproject.toml +} + +# Spelling check of files that differ from main branch. +spell_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell + fi +} + +# Run Codespell +## This flag runs spell check of individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + spell_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + spell_check_all +else + # Check spelling only of the files that changed in last commit. + spell_check_changed +fi +echo 'vLLM codespell: Done' + + # Lint specified files lint() { ruff "$@" @@ -117,9 +160,9 @@ lint_changed() { } # Run Ruff -echo 'vLLM Ruff:' -## This flag lints individual files. --files *must* be the first command line -## arg to use this option. +echo 'vLLM ruff:' +### This flag lints individual files. --files *must* be the first command line +### arg to use this option. if [[ "$1" == '--files' ]]; then lint "${@:2}" # If `--all` is passed, then any further arguments are ignored and the @@ -139,3 +182,5 @@ if ! git diff --quiet &>/dev/null; then exit 1 fi + + diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 55c4248ea9d26..0000000000000 --- a/mypy.ini +++ /dev/null @@ -1,8 +0,0 @@ -[mypy] -python_version = 3.8 - -ignore_missing_imports = True - -files = vllm -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ diff --git a/pyproject.toml b/pyproject.toml index b197256f6ff55..c5db016cebdb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,4 +31,22 @@ ignore = [ "E731", # line too long, handled by black formatting "E501", + # .strip() with multi-character strings + "B005", + # Loop control variable not used within loop body + "B007", ] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. 
+exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" + + +[tool.codespell] +ignore-words-list = "dout, te, indicies" +skip = "./tests/prompts" diff --git a/requirements-dev.txt b/requirements-dev.txt index f8126008d0794..b54a2773249cf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,9 @@ # formatting yapf==0.32.0 toml==0.10.2 +tomli==2.0.1 ruff==0.1.5 +codespell==2.2.6 # type checking mypy==0.991 diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index f739bbeaab334..18ce300449dbf 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -279,7 +279,7 @@ def create_random_embedding_layer(): 256, org_num_embeddings=512) expanded_embedding.weight.data[:512, :] = embedding_data - # We need to deepcopy the embedding as it will be modifed + # We need to deepcopy the embedding as it will be modified # in place lora_embedding = VocabParallelEmbeddingWithLoRA( deepcopy(expanded_embedding)) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 06fbf19eea824..dfaf8c700695a 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -15,7 +15,7 @@ def do_sample(llm, lora_path: str, lora_id: int): "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -53,7 +53,7 @@ def test_llama_lora(sql_lora_files, tp_size): "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the womens doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", ] expected_lora_output = [ " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 7f91051f03ac1..3946096d4296a 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -178,7 +178,7 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: if len(block_table) < len(logical_blocks): if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): - # re-use a block + # reuse a block block_table.append(block_table[len(block_table) % self.block_sliding_window]) else: diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f4ac2d6dc59fe..5e7cc3091d775 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -158,7 +158,7 @@ def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) def _schedule(self) -> SchedulerOutputs: - # Blocks that need to be swaped or copied before model execution. + # Blocks that need to be swapped or copied before model execution. 
blocks_to_swap_in: Dict[int, int] = {} blocks_to_swap_out: Dict[int, int] = {} blocks_to_copy: Dict[int, List[int]] = {} diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 307a33dcf2820..fc74269e55876 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -87,7 +87,7 @@ def add_lora(y: torch.Tensor, r = wb_t_all.size(-1) if buffer is None: # We set the buffer to be float32 by default to avoid - # numerical innacuracies that would otherwise happen + # numerical inaccuracies that would otherwise happen # due to downcasting. buffer = torch.zeros((x.size(0), r), dtype=torch.float32, diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index ba40d42307fab..a1a2ab0c4805c 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -537,7 +537,7 @@ def _fwd_kernel_alibi( alibi_start_q = tl.arange( 0, BLOCK_M) + block_start_loc + cur_batch_ctx_len alibi_start_k = cur_batch_ctx_len - # # init debuger + # # init debugger # offset_db_q = tl.arange(0, BLOCK_M) + block_start_loc # offset_db_k = tl.arange(0, BLOCK_N) # calc q[BLOCK_M, BLOCK_MODEL] mul k[prefix_len: , BLOCK_DMODEL] diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 07aa4b72bf7a0..abf4a462871b0 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -41,7 +41,7 @@ class DeciLMForCausalLM(LlamaForCausalLM): Based on the llama executor. The main difference is that DeciLM uses Variable Grouped Query Attention. - The constant number of GQA heads in the decoder is overriden with a value + The constant number of GQA heads in the decoder is overridden with a value per layer. Usually, in the HuggingFace implementation, instead of diff --git a/vllm/model_executor/parallel_utils/custom_all_reduce.py b/vllm/model_executor/parallel_utils/custom_all_reduce.py index ce4c8d02f7694..0c749c0484fc5 100644 --- a/vllm/model_executor/parallel_utils/custom_all_reduce.py +++ b/vllm/model_executor/parallel_utils/custom_all_reduce.py @@ -36,14 +36,14 @@ def init_custom_ar() -> None: if world_size not in _SUPPORTED_WORLD_SIZES: logger.warn( "Custom allreduce is disabled due to an unsupported world size: " - "%d. Supported world sizes: %s. To slience this warning, specify" + "%d. Supported world sizes: %s. To silence this warning, specify" "disable_custom_all_reduce=True explicitly.", world_size, str(_SUPPORTED_WORLD_SIZES)) return if not _can_p2p(rank, world_size): logger.warn( "Custom allreduce is disabled because your platform lacks GPU P2P" - " capability. To slience this warning, specify" + " capability. 
To silence this warning, specify" "disable_custom_all_reduce=True explicitly.") return _CA_HANDLE = CustomAllreduce(rank, world_size) diff --git a/vllm/model_executor/parallel_utils/parallel_state.py b/vllm/model_executor/parallel_utils/parallel_state.py index aeb07f64c37dc..c821936d06e4e 100644 --- a/vllm/model_executor/parallel_utils/parallel_state.py +++ b/vllm/model_executor/parallel_utils/parallel_state.py @@ -189,7 +189,7 @@ def get_pipeline_model_parallel_next_rank(): def get_pipeline_model_parallel_prev_rank(): - """Return the global rank that preceeds the caller in the pipeline""" + """Return the global rank that precedes the caller in the pipeline""" assert _PIPELINE_GLOBAL_RANKS is not None, ( "Pipeline parallel group is not initialized") rank_in_pipeline = get_pipeline_model_parallel_rank() diff --git a/vllm/utils.py b/vllm/utils.py index d7a3a3a2a9ef9..6206879929061 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -204,7 +204,7 @@ def _generate_random_fp8_e5m2( # NOTE(zhaoyang): Due to NaN and Inf representation for fp8 data type, # it may occur Inf or NaN if we directly use torch.randint # to generate random data for fp8 data. - # For example, s.11111.00 in fp8e5m2 format repesents Inf. + # For example, s.11111.00 in fp8e5m2 format represents Inf. # | E4M3 | E5M2 #-----|-------------|------------------- # Inf | N/A | s.11111.00 From fd5dcc5c816b7392821d3d4c02b13a7cf820d962 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 21 Feb 2024 20:17:52 -0800 Subject: [PATCH 006/113] Optimize GeGLU layer in Gemma (#2975) --- csrc/activation_kernels.cu | 73 ++++++++++++++++-------- csrc/ops.h | 4 ++ csrc/pybind.cpp | 4 ++ tests/kernels/test_activation.py | 50 +++++----------- vllm/model_executor/layers/activation.py | 23 ++++++++ vllm/model_executor/models/gemma.py | 31 +++++----- 6 files changed, 108 insertions(+), 77 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 5ba9ab178d5a4..22b10f0571d1c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -2,19 +2,16 @@ #include #include +#include + #include "cuda_compat.h" #include "dispatch_utils.h" namespace vllm { -template -__device__ __forceinline__ T silu(const T& x) { - // x * sigmoid(x) - return (T) (((float) x) / (1.0f + expf((float) -x))); -} - -template -__global__ void silu_and_mul_kernel( +// Activation and gating kernel template. +template +__global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { @@ -22,32 +19,58 @@ __global__ void silu_and_mul_kernel( for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); - out[token_idx * d + idx] = silu(x) * y; + out[token_idx * d + idx] = ACT_FN(x) * y; } } +template +__device__ __forceinline__ T silu_kernel(const T& x) { + // x * sigmoid(x) + return (T) (((float) x) / (1.0f + expf((float) -x))); +} + +template +__device__ __forceinline__ T gelu_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'none' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + const float f = (float) x; + constexpr float ALPHA = M_SQRT1_2; + return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); +} + } // namespace vllm +// Launch activation and gating kernel. 
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), \ + "act_and_mul_kernel", \ + [&] { \ + vllm::act_and_mul_kernel><<>>( \ + out.data_ptr(), \ + input.data_ptr(), \ + d); \ + }); + void silu_and_mul( torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - int64_t num_tokens = input.numel() / input.size(-1); - int d = input.size(-1) / 2; - - dim3 grid(num_tokens); - dim3 block(std::min(d, 1024)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input.scalar_type(), - "silu_and_mul_kernel", - [&] { - vllm::silu_and_mul_kernel<<>>( - out.data_ptr(), - input.data_ptr(), - d); - }); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); +} + +void gelu_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } namespace vllm { diff --git a/csrc/ops.h b/csrc/ops.h index 2bcd0c2efc5c6..dbdd2c2c57945 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -57,6 +57,10 @@ void silu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index b36d259697167..24c22020131e8 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -22,6 +22,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU."); + ops.def( + "gelu_and_mul", + &gelu_and_mul, + "Activation function used in GeGLU."); ops.def( "gelu_new", &gelu_new, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 8e216c293f070..e0dec144eba11 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,7 +1,10 @@ +from typing import Type + import pytest import torch -from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul +from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, + NewGELU, SiluAndMul) from allclose_default import get_default_atol, get_default_rtol DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -13,13 +16,15 @@ ] +@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_silu_and_mul( +def test_act_and_mul( + activation: Type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, @@ -31,48 +36,23 @@ def test_silu_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = SiluAndMul() + layer = activation() out = layer(x) ref_out = layer._forward(x) - assert torch.allclose(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) + # The SiLU and GELU implementations are equivalent to the native PyTorch + # implementations, so we can do exact comparison. 
+ assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0) +@pytest.mark.parametrize("activation", [FastGELU, NewGELU]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() -def test_gelu_new( - num_tokens: int, - d: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - torch.random.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.set_default_device(device) - x = torch.randn(num_tokens, d, dtype=dtype) - layer = NewGELU() - out = layer(x) - ref_out = layer._forward(x) - assert torch.allclose(out, - ref_out, - atol=get_default_atol(out), - rtol=get_default_rtol(out)) - - -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("d", D) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_gelu_fast( +def test_activation( + activation: Type[torch.nn.Module], num_tokens: int, d: int, dtype: torch.dtype, @@ -84,7 +64,7 @@ def test_gelu_fast( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) - layer = FastGELU() + layer = activation() out = layer(x) ref_out = layer._forward(x) assert torch.allclose(out, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 95902ae38e256..5a3a7b2dbaee7 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,29 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return out +class GeluAndMul(nn.Module): + """An activation function for GeGLU. + + The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
+ + Shapes: + x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) + return: (batch_size, seq_len, d) or (num_tokens, d) + """ + + def _forward(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + d = x.shape[-1] // 2 + return F.gelu(x[..., :d]) * x[..., d:] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.gelu_and_mul(out, x) + return out + + class NewGELU(nn.Module): def _forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03bd149c001d3..d8b515993d8ff 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -21,10 +21,11 @@ from transformers import GemmaConfig from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import PagedAttention from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearMethodBase, +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.rotary_embedding import get_rope @@ -50,27 +51,21 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ) -> None: super().__init__() - self.gate_proj = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=False, - linear_method=linear_method) - self.up_proj = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=False, - linear_method=linear_method) + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) self.down_proj = RowParallelLinear(intermediate_size, hidden_size, bias=False, linear_method=linear_method) - self.act_fn = nn.GELU() + self.act_fn = GeluAndMul() def forward(self, x): - gate, _ = self.gate_proj(x) - gate = self.act_fn(gate) - up, _ = self.up_proj(x) - fuse = gate * up - outputs, _ = self.down_proj(fuse) - return outputs + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x class GemmaAttention(nn.Module): @@ -294,6 +289,8 @@ def load_weights(self, ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) loaded_params = set() From c530e2cfe3b3d7e60130ff817cee7f3a395af232 Mon Sep 17 00:00:00 2001 From: 44670 <44670@users.noreply.github.com> Date: Thu, 22 Feb 2024 17:40:05 +0800 Subject: [PATCH 007/113] [FIX] Fix a bug in initializing Yarn RoPE (#2983) --- vllm/model_executor/layers/rotary_embedding.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 93ec5c12536fb..87068644112c0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -245,13 +245,11 @@ def _yarn_find_correction_range(low_rot: int, def _yarn_linear_ramp_mask(low: float, high: float, dim: int, - dtype: torch.dtype, - device: torch.device) -> torch.Tensor: + dtype: torch.dtype) -> torch.Tensor: if low == high: high += 0.001 # Prevent singularity - linear_func = 
(torch.arange(dim, dtype=dtype, device=device) - - low) / (high - low) + linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low) ramp_func = torch.clamp(linear_func, 0, 1) return ramp_func From 6f32cddf1c795e74a47e84620462431154718f49 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 22 Feb 2024 09:58:29 -0800 Subject: [PATCH 008/113] Remove Flash Attention in test env (#2982) --- requirements-dev.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b54a2773249cf..80d66530f47f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,7 +17,6 @@ pytest-forked pytest-asyncio httpx einops # required for MPT -flash_attn # required for HuggingFace's llama implementation openai requests -ray \ No newline at end of file +ray From 4caf7044e052399f07089aa8f586d5bd641f7d53 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Fri, 23 Feb 2024 00:00:12 +0200 Subject: [PATCH 009/113] Include tokens from prompt phase in `counter_generation_tokens` (#2802) --- .buildkite/test-pipeline.yaml | 3 +++ tests/metrics/test_metrics.py | 34 +++++++++++++++++++++++++++++++++- vllm/engine/llm_engine.py | 3 +++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a91dcdfaf2ea5..efcc4d2d07a12 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,6 +52,9 @@ steps: - label: LoRA Test command: pytest -v -s lora +- label: Metrics Test + command: pytest -v -s metrics + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" commands: diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index da608a6a18f92..fe09aa8237f24 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -9,13 +9,16 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) -def test_metrics( +def test_metric_counter_prompt_tokens( vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int, ) -> None: + # Reset metric + vllm.engine.metrics.counter_prompt_tokens.set_value({}, 0) + vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] @@ -31,3 +34,32 @@ def test_metrics( assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_metric_counter_generation_tokens( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + # Reset metric + vllm.engine.metrics.counter_generation_tokens.set_value({}, 0) + + vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + metric_count = vllm.engine.metrics.counter_generation_tokens.get_value({}) + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. 
+ vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + + assert vllm_generation_count == metric_count, ( + f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" + ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0de40f54db61..81c9281c55416 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -872,6 +872,9 @@ def _get_stats(self, num_prompt_tokens = sum( len(seq_group.prompt_token_ids) for seq_group in scheduler_outputs.scheduled_seq_groups) + num_generation_tokens = sum( + seq_group.num_seqs() + for seq_group in scheduler_outputs.scheduled_seq_groups) else: num_generation_tokens = scheduler_outputs.num_batched_tokens From 57f044945f25d90d1b434014b2719ba6b06fdc44 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Fri, 23 Feb 2024 06:25:07 +0800 Subject: [PATCH 010/113] Fix nvcc not found in vlm-openai image (#2781) --- vllm/config.py | 2 +- vllm/utils.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 0b8a2a27f6d43..bd0dc89b585f7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -319,7 +319,7 @@ def _verify_cache_dtype(self) -> None: pass elif self.cache_dtype == "fp8_e5m2": nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version < Version("11.8"): + if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"): raise ValueError( "FP8 is not supported when cuda version is lower than 11.8." ) diff --git a/vllm/utils.py b/vllm/utils.py index 6206879929061..8ca95e148eb39 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -181,13 +181,18 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) -def get_nvcc_cuda_version() -> Version: +def get_nvcc_cuda_version() -> Optional[Version]: cuda_home = os.environ.get('CUDA_HOME') if not cuda_home: cuda_home = '/usr/local/cuda' - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + if os.path.isfile(cuda_home + '/bin/nvcc'): + logger.info( + f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' + ) + else: + logger.warning( + f'Not found nvcc in {cuda_home}. 
Skip cuda version check!') + return None nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() From f7c1234990793008f3d44790fd274040f26c4ee4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 23 Feb 2024 12:57:48 -0800 Subject: [PATCH 011/113] [Fix] Fissertion on YaRN model len (#2984) --- vllm/model_executor/layers/rotary_embedding.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 87068644112c0..13749570f28a2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -354,7 +354,6 @@ def get_rope( elif scaling_type == "yarn": original_max_position = rope_scaling[ "original_max_position_embeddings"] - assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v for k, v in rope_scaling.items() From ef978fe4111b0eb91c81eceba4d9791b94c7ffbf Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 25 Feb 2024 19:54:00 +0000 Subject: [PATCH 012/113] Port metrics from `aioprometheus` to `prometheus_client` (#2730) --- docs/source/conf.py | 2 +- requirements-neuron.txt | 2 +- requirements-rocm.txt | 2 +- requirements.txt | 2 +- tests/conftest.py | 2 + tests/metrics/test_metrics.py | 25 ++-- vllm/engine/llm_engine.py | 3 +- vllm/engine/metrics.py | 170 ++++++++++++++++---------- vllm/entrypoints/openai/api_server.py | 12 +- 9 files changed, 133 insertions(+), 87 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index adbe67b21a0c8..5a45c6f9d1e0a 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,7 +72,7 @@ # Mock out external dependencies here. autodoc_mock_imports = [ - "torch", "transformers", "psutil", "aioprometheus", "sentencepiece", + "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", "vllm.cuda_utils", "vllm._C" ] diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 3f30ed08f037d..36e629add664d 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -6,4 +6,4 @@ neuronx-cc fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 42b89ae84aa45..e759ba7d028d9 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -10,4 +10,4 @@ transformers >= 4.38.0 # Required for Gemma. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client diff --git a/requirements.txt b/requirements.txt index de08bd29beaf9..de93ba6354cda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -aioprometheus[starlette] +prometheus_client pynvml == 11.5.0 triton >= 2.1.0 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. 
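
For orientation before the remaining file diffs of this patch: a minimal, self-contained sketch (not part of the diff) of the prometheus_client pattern the rest of the change switches to — metrics declared once with label names, updated via .labels(...), and read back per label set the way the updated tests do. Metric names, the "model_name" label, and the example model are taken from the files changed in this patch; anything else here is illustrative only.

    # Sketch of the labeled-metric pattern used by the new vllm/engine/metrics.py.
    from prometheus_client import Counter, Gauge, disable_created_metrics

    # Skip the extra *_created series, as the new metrics.py does.
    disable_created_metrics()

    # Names mirror definitions in vllm/engine/metrics.py.
    counter_prompt_tokens = Counter(
        "vllm:prompt_tokens_total",
        "Number of prefill tokens processed.",
        labelnames=["model_name"])
    gauge_gpu_cache_usage = Gauge(
        "vllm:gpu_cache_usage_perc",
        "GPU KV-cache usage. 1 means 100 percent usage.",
        labelnames=["model_name"])

    # The engine passes labels once (see the llm_engine.py hunk below) and
    # every update goes through .labels(...).
    labels = {"model_name": "facebook/opt-125m"}
    counter_prompt_tokens.labels(**labels).inc(128)
    gauge_gpu_cache_usage.labels(**labels).set(0.42)

    # The updated tests read a labeled counter back via the internal value:
    assert counter_prompt_tokens.labels(**labels)._value.get() == 128
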
diff --git a/tests/conftest.py b/tests/conftest.py index 6af9b36b6febe..30a3df89d9f12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -165,6 +165,7 @@ def __init__( dtype: str = "half", disable_log_stats: bool = True, tensor_parallel_size: int = 1, + **kwargs, ) -> None: self.model = LLM( model=model_name, @@ -174,6 +175,7 @@ def __init__( swap_space=0, disable_log_stats=disable_log_stats, tensor_parallel_size=tensor_parallel_size, + **kwargs, ) def generate( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index fe09aa8237f24..410bdfa5c69e2 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,5 +1,4 @@ import pytest -import vllm.engine.metrics MODELS = [ "facebook/opt-125m", @@ -16,10 +15,10 @@ def test_metric_counter_prompt_tokens( dtype: str, max_tokens: int, ) -> None: - # Reset metric - vllm.engine.metrics.counter_prompt_tokens.set_value({}, 0) - - vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_model = vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. @@ -29,7 +28,9 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - metric_count = vllm.engine.metrics.counter_prompt_tokens.get_value({}) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_prompt_tokens.labels( + **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" @@ -46,13 +47,15 @@ def test_metric_counter_generation_tokens( dtype: str, max_tokens: int, ) -> None: - # Reset metric - vllm.engine.metrics.counter_generation_tokens.set_value({}, 0) - - vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False) + vllm_model = vllm_runner(model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) tokenizer = vllm_model.model.get_tokenizer() - metric_count = vllm.engine.metrics.counter_generation_tokens.get_value({}) + stat_logger = vllm_model.model.llm_engine.stat_logger + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() vllm_generation_count = 0 for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 81c9281c55416..c1a75924c6d72 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -128,7 +128,8 @@ def __init__( # Metric Logging. 
if self.log_stats: self.stat_logger = StatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC) + local_interval=_LOCAL_LOGGING_INTERVAL_SEC, + labels=dict(model_name=model_config.model)) self.forward_dag = None if USE_RAY_COMPILED_DAG: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e613b9f551b2f..83e66a9372272 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,66 +1,94 @@ from vllm.logger import init_logger -from aioprometheus import Counter, Gauge, Histogram +from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics import time import numpy as np -from typing import List +from typing import Dict, List from dataclasses import dataclass logger = init_logger(__name__) -labels = {} - - -def add_global_metrics_labels(**kwargs): - labels.update(kwargs) - +disable_created_metrics() # The begin-* and end* here are used by the documentation generator # to extract the metrics definitions. + # begin-metrics-definitions -gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s", - "Average prefill throughput in tokens/s.") -gauge_avg_generation_throughput = Gauge( - "vllm:avg_generation_throughput_toks_per_s", - "Average generation throughput in tokens/s.") -counter_prompt_tokens = Counter("vllm:prompt_tokens_total", - "Number of prefill tokens processed.") -counter_generation_tokens = Counter("vllm:generation_tokens_total", - "Number of generation tokens processed.") - -gauge_scheduler_running = Gauge( - "vllm:num_requests_running", - "Number of requests currently running on GPU.") -gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped", - "Number of requests swapped to CPU.") -gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting", - "Number of requests waiting to be processed.") - -gauge_gpu_cache_usage = Gauge( - "vllm:gpu_cache_usage_perc", - "GPU KV-cache usage. 1 means 100 percent usage.") -gauge_cpu_cache_usage = Gauge( - "vllm:cpu_cache_usage_perc", - "CPU KV-cache usage. 
1 means 100 percent usage.") - -histogram_time_to_first_token = Histogram( - "vllm:time_to_first_token_seconds", - "Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0, - 2.5, 5.0, 7.5, 10.0 - ]) -histogram_time_per_output_tokens = Histogram( - "vllm:time_per_output_token_seconds", - "Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5 - ]) -histogram_e2e_request_latency = Histogram( - "vllm:e2e_request_latency_seconds", - "Histogram of end to end request latency in seconds.", - buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) +class Metrics: + + def __init__(self, labelnames: List[str]): + # Unregister any existing vLLM collectors + for collector in list(REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + REGISTRY.unregister(collector) + + # System stats + self.gauge_scheduler_running = Gauge( + name="vllm:num_requests_running", + documentation="Number of requests currently running on GPU.", + labelnames=labelnames) + self.gauge_scheduler_swapped = Gauge( + name="vllm:num_requests_swapped", + documentation="Number of requests swapped to CPU.", + labelnames=labelnames) + self.gauge_scheduler_waiting = Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames) + self.gauge_gpu_cache_usage = Gauge( + name="vllm:gpu_cache_usage_perc", + documentation="GPU KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + self.gauge_cpu_cache_usage = Gauge( + name="vllm:cpu_cache_usage_perc", + documentation="CPU KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + + # Raw stats from last model iteration + self.counter_prompt_tokens = Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + self.counter_generation_tokens = Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames) + self.histogram_time_to_first_token = Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + labelnames=labelnames, + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ]) + self.histogram_time_per_output_token = Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + labelnames=labelnames, + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5 + ]) + self.histogram_e2e_request_latency = Histogram( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of end to end request latency in seconds.", + labelnames=labelnames, + buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) + + # Legacy metrics + self.gauge_avg_prompt_throughput = Gauge( + name="vllm:avg_prompt_throughput_toks_per_s", + documentation="Average prefill throughput in tokens/s.", + labelnames=labelnames, + ) + self.gauge_avg_generation_throughput = Gauge( + name="vllm:avg_generation_throughput_toks_per_s", + documentation="Average generation throughput in tokens/s.", + labelnames=labelnames, + ) + + # end-metrics-definitions @@ -87,7 +115,7 @@ class Stats: class StatLogger: """StatLogger is used LLMEngine to log to Promethus and Stdout.""" - def 
__init__(self, local_interval: float) -> None: + def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: # Metadata for logging locally. self.last_local_log = time.monotonic() self.local_interval = local_interval @@ -96,6 +124,10 @@ def __init__(self, local_interval: float) -> None: self.num_prompt_tokens: List[int] = [] self.num_generation_tokens: List[int] = [] + # Prometheus metrics + self.labels = labels + self.metrics = Metrics(labelnames=list(labels.keys())) + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: return float(np.sum(tracked_stats) / (now - self.last_local_log)) @@ -105,23 +137,33 @@ def _local_interval_elapsed(self, now: float) -> bool: def _log_prometheus(self, stats: Stats) -> None: # Set system stat gauges. - gauge_scheduler_running.set(labels, stats.num_running) - gauge_scheduler_swapped.set(labels, stats.num_swapped) - gauge_scheduler_waiting.set(labels, stats.num_waiting) - gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage) - gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage) + self.metrics.gauge_scheduler_running.labels(**self.labels).set( + stats.num_running) + self.metrics.gauge_scheduler_swapped.labels(**self.labels).set( + stats.num_swapped) + self.metrics.gauge_scheduler_waiting.labels(**self.labels).set( + stats.num_waiting) + self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set( + stats.gpu_cache_usage) + self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set( + stats.cpu_cache_usage) # Add to token counters. - counter_prompt_tokens.add(labels, stats.num_prompt_tokens) - counter_generation_tokens.add(labels, stats.num_generation_tokens) + self.metrics.counter_prompt_tokens.labels(**self.labels).inc( + stats.num_prompt_tokens) + self.metrics.counter_generation_tokens.labels(**self.labels).inc( + stats.num_generation_tokens) # Observe request level latencies in histograms. for ttft in stats.time_to_first_tokens: - histogram_time_to_first_token.observe(labels, ttft) + self.metrics.histogram_time_to_first_token.labels( + **self.labels).observe(ttft) for tpot in stats.time_per_output_tokens: - histogram_time_per_output_tokens.observe(labels, tpot) + self.metrics.histogram_time_per_output_token.labels( + **self.labels).observe(tpot) for e2e in stats.time_e2e_requests: - histogram_e2e_request_latency.observe(labels, e2e) + self.metrics.histogram_e2e_request_latency.labels( + **self.labels).observe(e2e) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: @@ -130,8 +172,10 @@ def _log_prometheus_interval(self, prompt_throughput: float, # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - gauge_avg_prompt_throughput.set(labels, prompt_throughput) - gauge_avg_generation_throughput.set(labels, generation_throughput) + self.metrics.gauge_avg_prompt_throughput.labels( + **self.labels).set(prompt_throughput) + self.metrics.gauge_avg_generation_throughput.labels( + **self.labels).set(generation_throughput) def log(self, stats: Stats) -> None: """Called by LLMEngine. 
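Since the counters, gauges and histograms now hang off a per-engine `Metrics` object rather than module-level globals, they are read back through the engine's `StatLogger`, which is exactly what the updated `tests/metrics/test_metrics.py` above does. A minimal sketch of that pattern (the model name and prompt are illustrative, and this assumes a machine that can actually load the model):

from vllm import LLM

# Keep stats enabled so llm_engine.stat_logger (and its Prometheus metrics) exists.
llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
llm.generate("Hello, my name is")

stat_logger = llm.llm_engine.stat_logger
# Every metric child is bound to the engine's labels, currently {"model_name": ...}.
prompt_tokens = stat_logger.metrics.counter_prompt_tokens.labels(
    **stat_logger.labels)._value.get()
print("prompt tokens counted so far:", prompt_tokens)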
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a217605452e3a..b2f040114a078 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -6,8 +6,7 @@ import importlib import inspect -from aioprometheus import MetricsMiddleware -from aioprometheus.asgi.starlette import metrics +from prometheus_client import make_asgi_app import fastapi import uvicorn from http import HTTPStatus @@ -18,7 +17,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import add_global_metrics_labels from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat @@ -141,8 +139,9 @@ def parse_args(): return parser.parse_args() -app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics -app.add_route("/metrics", metrics) # Exposes HTTP metrics +# Add prometheus asgi middleware to route /metrics requests +metrics_app = make_asgi_app() +app.mount("/metrics", metrics_app) @app.exception_handler(RequestValidationError) @@ -242,9 +241,6 @@ async def authentication(request: Request, call_next): openai_serving_completion = OpenAIServingCompletion( engine, served_model, args.lora_modules) - # Register labels for metrics - add_global_metrics_labels(model_name=engine_args.model) - app.root_path = args.root_path uvicorn.run(app, host=args.host, From 70f3e8e3a1ed081003c0a2b70de151bb144f98e0 Mon Sep 17 00:00:00 2001 From: Jared Moore <27744679+jlcmoore@users.noreply.github.com> Date: Sun, 25 Feb 2024 18:39:34 -0800 Subject: [PATCH 013/113] Add LogProbs for Chat Completions in OpenAI (#2918) --- tests/entrypoints/test_openai_server.py | 25 ++++++++-------- vllm/entrypoints/openai/protocol.py | 8 ++++++ vllm/entrypoints/openai/serving_chat.py | 38 +++++++++++++++++++++++-- 3 files changed, 57 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 3a359502c39d5..29d0e6fd537d5 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -155,15 +155,18 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - ) + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10) assert chat_completion.id is not None assert chat_completion.choices is not None and len( chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None + assert chat_completion.choices[0].logprobs is not None + assert chat_completion.choices[0].logprobs.top_logprobs is not None + assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -198,13 +201,11 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI, single_output = single_completion.choices[0].text single_usage = single_completion.usage - stream = await client.completions.create( - model=model_name, - prompt=prompt, - max_tokens=5, - temperature=0.0, - stream=True, - ) + stream = await client.completions.create(model=model_name, + 
prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) chunks = [] async for chunk in stream: chunks.append(chunk.choices[0].text) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 7c2aa707775ff..f57a2fb775783 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -63,6 +63,8 @@ class ChatCompletionRequest(BaseModel): seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stream: Optional[bool] = False + logprobs: Optional[bool] = False + top_logprobs: Optional[int] = None presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 logit_bias: Optional[Dict[str, float]] = None @@ -84,6 +86,8 @@ class ChatCompletionRequest(BaseModel): length_penalty: Optional[float] = 1.0 def to_sampling_params(self) -> SamplingParams: + if self.logprobs and not self.top_logprobs: + raise ValueError("Top logprobs must be set when logprobs is.") return SamplingParams( n=self.n, presence_penalty=self.presence_penalty, @@ -96,6 +100,8 @@ def to_sampling_params(self) -> SamplingParams: stop=self.stop, stop_token_ids=self.stop_token_ids, max_tokens=self.max_tokens, + logprobs=self.top_logprobs if self.logprobs else None, + prompt_logprobs=self.top_logprobs if self.echo else None, best_of=self.best_of, top_k=self.top_k, ignore_eos=self.ignore_eos, @@ -216,6 +222,7 @@ class ChatMessage(BaseModel): class ChatCompletionResponseChoice(BaseModel): index: int message: ChatMessage + logprobs: Optional[LogProbs] = None finish_reason: Optional[Literal["stop", "length"]] = None @@ -236,6 +243,7 @@ class DeltaMessage(BaseModel): class ChatCompletionResponseStreamChoice(BaseModel): index: int delta: DeltaMessage + logprobs: Optional[LogProbs] = None finish_reason: Optional[Literal["stop", "length"]] = None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 850797ae4b9b6..dd152583c2329 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -101,7 +101,10 @@ async def chat_completion_stream_generator( role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( - index=i, delta=DeltaMessage(role=role), finish_reason=None) + index=i, + delta=DeltaMessage(role=role), + logprobs=None, + finish_reason=None) chunk = ChatCompletionStreamResponse(id=request_id, object=chunk_object_type, created=created_time, @@ -118,6 +121,7 @@ async def chat_completion_stream_generator( "content") and request.messages[-1].get( "role") == role: last_msg_content = request.messages[-1]["content"] + if last_msg_content: for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -129,6 +133,7 @@ async def chat_completion_stream_generator( object=chunk_object_type, created=created_time, choices=[choice_data], + logprobs=None, model=model_name) data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -145,15 +150,29 @@ async def chat_completion_stream_generator( if finish_reason_sent[i]: continue + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + top_logprobs = output.logprobs[ + previous_num_tokens[i]:] if output.logprobs else None + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + delta_text = 
output.text[len(previous_texts[i]):] previous_texts[i] = output.text previous_num_tokens[i] = len(output.token_ids) - if output.finish_reason is None: # Send token-by-token response for each request.n choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), + logprobs=logprobs, finish_reason=None) chunk = ChatCompletionStreamResponse( id=request_id, @@ -174,6 +193,7 @@ async def chat_completion_stream_generator( choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(content=delta_text), + logprobs=logprobs, finish_reason=output.finish_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -208,11 +228,25 @@ async def chat_completion_full_generator( assert final_res is not None choices = [] + role = self.get_chat_request_role(request) for output in final_res.outputs: + token_ids = output.token_ids + top_logprobs = output.logprobs + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + ) + else: + logprobs = None + choice_data = ChatCompletionResponseChoice( index=output.index, message=ChatMessage(role=role, content=output.text), + logprobs=logprobs, finish_reason=output.finish_reason, ) choices.append(choice_data) From cfc15a1031ef0197a1b291d2ed93717a9bdad268 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 26 Feb 2024 13:48:56 -0800 Subject: [PATCH 014/113] Optimize Triton MoE Kernel (#2979) Co-authored-by: Cade Daniel --- benchmarks/kernels/benchmark_mixtral_moe.py | 172 ++++++++++++++++++ setup.py | 4 +- .../layers/fused_moe/__init__.py | 5 + ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 20 ++ ...168,device_name=NVIDIA_H100_80GB_HBM3.json | 24 +++ .../layers/fused_moe/configs/README | 10 + .../layers/{ => fused_moe}/fused_moe.py | 77 ++++++-- 7 files changed, 297 insertions(+), 15 deletions(-) create mode 100644 benchmarks/kernels/benchmark_mixtral_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/__init__.py create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/README rename vllm/model_executor/layers/{ => fused_moe}/fused_moe.py (85%) diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py new file mode 100644 index 0000000000000..9e08df76947f8 --- /dev/null +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -0,0 +1,172 @@ +import json +import os +import sys + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +from vllm.model_executor.layers.fused_moe import fused_moe +import torch +import torch.nn.functional as F +import triton + + +def main(): + method = fused_moe + for bs in [ + 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + 2048, 3072, 4096 + ]: + run_grid(bs, method=method) + + +def run_grid(bs, method): + d_model = 4096 + num_total_experts = 8 + top_k = 2 + tp_size = 2 + model_intermediate_size = 14336 + num_layers = 32 + num_calls = 100 + + num_warmup_trials = 1 + num_trials = 1 + + configs = [] + if bs <= 16: + BLOCK_SIZES_M = [16] + elif bs <= 32: + BLOCK_SIZES_M = [16, 32] + elif bs <= 64: + BLOCK_SIZES_M = [16, 32, 64] + elif bs <= 128: + BLOCK_SIZES_M = [16, 32, 64, 128] + else: + BLOCK_SIZES_M = [16, 32, 64, 128, 256] + + for block_size_n in [32, 64, 128, 256]: + for block_size_m in 
BLOCK_SIZES_M: + for block_size_k in [64, 128, 256]: + for group_size_m in [1, 16, 32, 64]: + for num_warps in [4, 8]: + configs.append({ + "BLOCK_SIZE_M": block_size_m, + "BLOCK_SIZE_N": block_size_n, + "BLOCK_SIZE_K": block_size_k, + "GROUP_SIZE_M": group_size_m, + "num_warps": num_warps, + "num_stages": 4, + }) + + best_config = None + best_time_us = 1e20 + + for config in configs: + print(f'{tp_size=} {bs=}') + print(f'{config}') + # warmup + print(f'warming up') + try: + for _ in range(num_warmup_trials): + run_timing( + num_calls=num_calls, + bs=bs, + d_model=d_model, + num_total_experts=num_total_experts, + top_k=top_k, + tp_size=tp_size, + model_intermediate_size=model_intermediate_size, + method=method, + config=config, + ) + except triton.runtime.autotuner.OutOfResources: + continue + + # trial + print(f'benchmarking') + for _ in range(num_trials): + kernel_dur_ms = run_timing( + num_calls=num_calls, + bs=bs, + d_model=d_model, + num_total_experts=num_total_experts, + top_k=top_k, + tp_size=tp_size, + model_intermediate_size=model_intermediate_size, + method=method, + config=config, + ) + + kernel_dur_us = 1000 * kernel_dur_ms + model_dur_ms = kernel_dur_ms * num_layers + + if kernel_dur_us < best_time_us: + best_config = config + best_time_us = kernel_dur_us + + print( + f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' + ) + + print("best_time_us", best_time_us) + print("best_config", best_config) + + filename = "/tmp/config.jsonl" + print(f"writing config to file {filename}") + with open(filename, "a") as f: + f.write(json.dumps({str(bs): best_config}) + "\n") + + +def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, + top_k: int, tp_size: int, model_intermediate_size: int, method, + config) -> float: + shard_intermediate_size = model_intermediate_size // tp_size + + hidden_states = torch.rand( + (bs, d_model), + device="cuda:0", + dtype=torch.bfloat16, + ) + + ws = torch.rand( + (num_total_experts, 2 * shard_intermediate_size, d_model), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + w2s = torch.rand( + (num_total_experts, d_model, shard_intermediate_size), + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + + gating_output = F.softmax(torch.rand( + (num_calls, bs, num_total_experts), + device=hidden_states.device, + dtype=torch.float32, + ), + dim=-1) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + for i in range(num_calls): + hidden_states = method( + hidden_states=hidden_states, + w1=ws, + w2=w2s, + gating_output=gating_output[i], + topk=2, + renormalize=True, + inplace=True, + override_config=config, + ) + end_event.record() + end_event.synchronize() + + dur_ms = start_event.elapsed_time(end_event) / num_calls + return dur_ms + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/setup.py b/setup.py index 8fcb86394f76d..16978d74e0425 100644 --- a/setup.py +++ b/setup.py @@ -432,7 +432,9 @@ def get_requirements() -> List[str]: return requirements -package_data = {"vllm": ["py.typed"]} +package_data = { + "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] +} if os.environ.get("VLLM_USE_PRECOMPILED"): ext_modules = [] package_data["vllm"].append("*.so") diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py new file mode 100644 index 
0000000000000..1391d43c8abeb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe + +__all__ = [ + "fused_moe", +] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..1fefb5ff7e42d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,20 @@ +{ + "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, + "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, + "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, + "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, + "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, + "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, + "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..64d49ca66c1c8 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,24 @@ +{ + "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, 
"BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, + "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, + "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, + "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, + "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, + "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, + "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, + "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, + "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, + "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} +} diff --git a/vllm/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README new file mode 100644 index 0000000000000..45d40cbfb1a2e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/README @@ -0,0 +1,10 @@ +This directory contains tuned configurations for different settings of the fused_moe kernel. +For different settings of +- E (number of experts) +- N (intermediate size) +- device_name (torch.cuda.get_device_name()) +the JSON file contains a mapping from M (batch size) to the chosen configuration. + +The example configurations provided are for the Mixtral model for TP2 on H100 +and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have +N = 7168 and for TP4 we have N = 3584. 
diff --git a/vllm/model_executor/layers/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py similarity index 85% rename from vllm/model_executor/layers/fused_moe.py rename to vllm/model_executor/layers/fused_moe/fused_moe.py index bc3aef1887ef8..830fde6c4eb6d 100644 --- a/vllm/model_executor/layers/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,11 +1,19 @@ """Fused MoE kernel.""" +import functools +import json +import os +from typing import Any, Dict, Optional + import torch import triton import triton.language as tl from vllm._C import ops +from vllm.logger import init_logger from vllm.utils import is_hip +logger = init_logger(__name__) + @triton.jit def fused_moe_kernel( @@ -210,6 +218,34 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +@functools.lru_cache +def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: + """ + Return optimized configurations for the fused MoE kernel. + + The return value will be a dictionary that maps an irregular grid of batch sizes + to configurations of the fused_moe kernel. To evaluate the kernel on a given batch + size bs, the closest batch size in the grid should be picked and the associated + configuration chosen to invoke the kernel. + """ + + # First look up if an optimized configuration is available in the configs directory + device_name = torch.cuda.get_device_name().replace(" ", "_") + + config_file_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "configs", + f"E={E},N={N},device_name={device_name}.json") + if os.path.exists(config_file_path): + with open(config_file_path) as f: + logger.info( + f"Using configuration from {config_file_path} for MoE layer.") + # If a configuration has been found, return it + return {int(key): val for key, val in json.load(f).items()} + + # If no optimized configuration is available, we will use the default configuration + return None + + def fused_moe( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -218,6 +254,7 @@ def fused_moe( topk: int, renormalize: bool, inplace: bool = False, + override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. @@ -230,6 +267,7 @@ def fused_moe( - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - inplace (bool): If True, perform the operation in-place. Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. Returns: - torch.Tensor: The output tensor after applying the MoE layer. 
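Taken together with the configs README above, the lookup documented in `get_moe_configs()` works in two steps: the JSON file is chosen by (E, N, device name), and within that file the entry whose batch size is closest to the current M is used, as the next hunk implements. In outline (a sketch with illustrative values for E, N and M, assuming the configs directory is readable and a CUDA device is visible):

import json
import os

import torch

E, N, M = 8, 7168, 600  # num experts, shard intermediate size, batch size (illustrative)
device_name = torch.cuda.get_device_name().replace(" ", "_")
config_file = os.path.join("configs", f"E={E},N={N},device_name={device_name}.json")

with open(config_file) as f:
    # Keys form an irregular grid of batch sizes; values are kernel configurations.
    configs = {int(key): val for key, val in json.load(f).items()}

# Pick the configuration tuned for the batch size closest to M.
config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
print(config)  # one of the entries from the matching JSON file

In the patch itself this lookup sits behind `functools.lru_cache`, so the JSON file is only read once per (E, N) pair.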
@@ -279,20 +317,31 @@ def fused_moe( if renormalize: topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True) - config = { - 'BLOCK_SIZE_M': 64, - 'BLOCK_SIZE_N': 64, - 'BLOCK_SIZE_K': 32, - 'GROUP_SIZE_M': 8 - } - - if topk_ids.numel() <= w1.shape[0]: - config = { - 'BLOCK_SIZE_M': 16, - 'BLOCK_SIZE_N': 32, - 'BLOCK_SIZE_K': 64, - 'GROUP_SIZE_M': 1 - } + if override_config: + config = override_config + else: + # First try to load optimal config from the file + configs = get_moe_configs(E, w2.shape[2]) + + if configs: + # If an optimal configuration map has been found, look up the optimal config + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Else use the default config + config = { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32, + 'GROUP_SIZE_M': 8 + } + + if M <= E: + config = { + 'BLOCK_SIZE_M': 16, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 64, + 'GROUP_SIZE_M': 1 + } intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N), device=hidden_states.device, From d6e4a130b028f42a7f413d99eb91a4395fa7a04a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 15:00:54 -0800 Subject: [PATCH 015/113] [Minor] Remove gather_cached_kv kernel (#3043) --- csrc/cache.h | 7 -- csrc/cache_kernels.cu | 161 ------------------------------------------ csrc/pybind.cpp | 4 -- 3 files changed, 172 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 21c71830f7942..765e231abd26f 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -23,13 +23,6 @@ void reshape_and_cache( torch::Tensor& slot_mapping, const std::string& kv_cache_dtype); -void gather_cached_kv( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping); - // Just for unittest void convert_fp8_e5m2( torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index ceb7347d94670..7254010b8e3a9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -269,167 +269,6 @@ void reshape_and_cache( namespace vllm { -// Grid: (num_blocks, block_size). 
-template -__global__ void gather_cached_kv_kernel( - scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) { - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int num_tokens = num_heads * head_size; - for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { - const int tgt_key_idx = token_idx * key_stride + i; - const int tgt_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); - value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); - } -} - -template -__global__ void gather_cached_kv_kernel_optimized( - scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int *__restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) -{ - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int dim = num_heads * head_size; - assert(dim % 4 == 0); // this is true for known use cases - const int unroll_factor = 4; - const int unrolled_dim = dim / unroll_factor; - - for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x) - { - int tgt_key_indices[unroll_factor]; - int tgt_value_indices[unroll_factor]; - int src_key_indices[unroll_factor]; - int src_value_indices[unroll_factor]; - scalar_t keys_to_store[unroll_factor]; - scalar_t values_to_store[unroll_factor]; - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - int index = i + j * unrolled_dim; - - const int tgt_key_idx = token_idx * key_stride + index; - const int tgt_value_idx = token_idx * value_stride + index; - - const int head_idx = index / head_size; - const int head_offset = index % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - 
const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - tgt_key_indices[j] = tgt_key_idx; - tgt_value_indices[j] = tgt_value_idx; - src_key_indices[j] = src_key_idx; - src_value_indices[j] = src_value_idx; - - keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); - values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); - } - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - key[tgt_key_indices[j]] = keys_to_store[j]; - value[tgt_value_indices[j]] = values_to_store[j]; - } - } -} - -} // namespace vllm - -void gather_cached_kv( - torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& value, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [in] [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [in] [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping) // [in] [num_tokens] -{ - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key.scalar_type(), - "gather_cached_kv_kernel_optimized", - [&] { - vllm::gather_cached_kv_kernel_optimized<<>>( - key.data_ptr(), - value.data_ptr(), - key_cache.data_ptr(), - value_cache.data_ptr(), - slot_mapping.data_ptr(), - key_stride, - value_stride, - num_heads, - head_size, - block_size, - x); - }); -} - -namespace vllm { - template __global__ void convert_fp8_e5m2_kernel( const Tin* __restrict__ src_cache, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 24c22020131e8..5d062bb5700bc 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -79,10 +79,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "reshape_and_cache", &reshape_and_cache, "Reshape the key and value tensors and cache them"); - cache_ops.def( - "gather_cached_kv", - &gather_cached_kv, - "Gather key and value from the cache into contiguous QKV tensors"); cache_ops.def( "convert_fp8_e5m2", &convert_fp8_e5m2, From d9f726c4d0920e705069c005fb3b1042368961ae Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 09:25:22 +0800 Subject: [PATCH 016/113] [Minor] Remove unused config files (#3039) --- vllm/model_executor/models/baichuan.py | 6 +- vllm/model_executor/models/olmo.py | 4 +- vllm/model_executor/models/qwen.py | 8 +-- vllm/transformers_utils/config.py | 2 - vllm/transformers_utils/configs/__init__.py | 6 -- vllm/transformers_utils/configs/baichuan.py | 62 ------------------ vllm/transformers_utils/configs/olmo.py | 72 --------------------- vllm/transformers_utils/configs/qwen.py | 60 ----------------- 8 files changed, 10 insertions(+), 210 deletions(-) delete mode 100644 vllm/transformers_utils/configs/baichuan.py delete mode 100644 vllm/transformers_utils/configs/olmo.py delete mode 100644 vllm/transformers_utils/configs/qwen.py diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f08c3c8d257ff..550dec6487f9e 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -23,6 +23,7 @@ import torch from torch import nn +from transformers import 
PretrainedConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul @@ -42,7 +43,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -186,7 +186,7 @@ def forward( class BaiChuanDecoderLayer(nn.Module): def __init__(self, - config: BaiChuanConfig, + config: PretrainedConfig, position_embedding: str, linear_method: Optional[LinearMethodBase] = None): super().__init__() @@ -245,7 +245,7 @@ def forward( class BaiChuanModel(nn.Module): def __init__(self, - config: BaiChuanConfig, + config: PretrainedConfig, position_embedding: str, linear_method: Optional[LinearMethodBase] = None): super().__init__() diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 2eb42935e8bfd..9d563039208c8 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -61,7 +61,9 @@ hf_model_weights_iterator, ) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.olmo import OLMoConfig + +# this model must need this dependency +from hf_olmo import OLMoConfig KVCache = Tuple[torch.Tensor, torch.Tensor] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index fbc7320fb45a4..37af84c7cd53f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -8,6 +8,7 @@ import torch from torch import nn +from transformers import PretrainedConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul @@ -27,7 +28,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.transformers_utils.configs.qwen import QWenConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -127,7 +127,7 @@ class QWenBlock(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -179,7 +179,7 @@ class QWenModel(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -222,7 +222,7 @@ class QWenLMHeadModel(nn.Module): def __init__( self, - config: QWenConfig, + config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None, ): super().__init__() diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 491cb4d9a427c..6b0413f440a0e 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,10 +5,8 @@ from vllm.transformers_utils.configs import * _CONFIG_REGISTRY = { - "baichuan": BaiChuanConfig, "chatglm": ChatGLMConfig, "mpt": MPTConfig, - "qwen": QWenConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 47bcc2b9594be..ef955f75cedaa 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,18 +1,12 @@ -from vllm.transformers_utils.configs.baichuan import BaiChuanConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from 
vllm.transformers_utils.configs.mpt import MPTConfig -from vllm.transformers_utils.configs.olmo import OLMoConfig -from vllm.transformers_utils.configs.qwen import QWenConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig __all__ = [ - "BaiChuanConfig", "ChatGLMConfig", "MPTConfig", - "OLMoConfig", - "QWenConfig", "RWConfig", ] diff --git a/vllm/transformers_utils/configs/baichuan.py b/vllm/transformers_utils/configs/baichuan.py deleted file mode 100644 index 869817525c11a..0000000000000 --- a/vllm/transformers_utils/configs/baichuan.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers.configuration_utils import PretrainedConfig - - -class BaiChuanConfig(PretrainedConfig): - model_type = "baichuan" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=64000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/vllm/transformers_utils/configs/olmo.py b/vllm/transformers_utils/configs/olmo.py deleted file mode 100644 index a9dfc6ec88ca6..0000000000000 --- a/vllm/transformers_utils/configs/olmo.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding=utf-8 -# adapted from https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/configuration_olmo.py -"""OLMo configuration""" -from transformers import PretrainedConfig - - -class OLMoConfig(PretrainedConfig): - model_type = 'olmo' - attribute_map = { - 'num_attention_heads': 'n_heads', - 'hidden_size': 'd_model', - 'num_hidden_layers': 'n_layers', - } - - # Note that the defaults for these attributes are equivalent to the base GPT2 model. 
- def __init__( - self, - d_model=768, - n_heads=12, - n_layers=12, - mlp_ratio=4, - mlp_hidden_size=None, - activation_type="swiglu", - block_type="sequential", - block_group_size=1, - alibi=False, - alibi_bias_max=8.0, - rope=False, - rope_full_precision=True, - multi_query_attention=False, - attention_layer_norm=False, - layer_norm_type="default", - layer_norm_with_affine=True, - attention_layer_norm_with_affine=True, - max_sequence_length=1024, - include_bias=True, - bias_for_layer_norm=None, - scale_logits=False, - vocab_size=50257, - embedding_size=50304, - weight_tying=True, - eos_token_id=50256, - pad_token_id=50256, - **kwargs, - ): - self.d_model = d_model - self.n_heads = n_heads - self.n_layers = n_layers - self.mlp_ratio = mlp_ratio - self.mlp_hidden_size = mlp_hidden_size - self.activation_type = activation_type - self.block_type = block_type - self.block_group_size = block_group_size - self.alibi = alibi - self.alibi_bias_max = alibi_bias_max - self.rope = rope - self.rope_full_precision = rope_full_precision - self.multi_query_attention = multi_query_attention - self.attention_layer_norm = attention_layer_norm - self.layer_norm_type = layer_norm_type - self.layer_norm_with_affine = layer_norm_with_affine - self.attention_layer_norm_with_affine = attention_layer_norm_with_affine - self.max_sequence_length = max_sequence_length - self.include_bias = include_bias - self.bias_for_layer_norm = bias_for_layer_norm - self.scale_logits = scale_logits - self.vocab_size = vocab_size - self.embedding_size = embedding_size - self.weight_tying = weight_tying - self.eos_token_id = eos_token_id - self.pad_token_id = pad_token_id - super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/qwen.py b/vllm/transformers_utils/configs/qwen.py deleted file mode 100644 index bb033a337ad04..0000000000000 --- a/vllm/transformers_utils/configs/qwen.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Alibaba Cloud. 
-# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE - -from transformers import PretrainedConfig - - -class QWenConfig(PretrainedConfig): - model_type = "qwen" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - num_hidden_layers=32, - num_attention_heads=32, - emb_dropout_prob=0.0, - attn_dropout_prob=0.0, - layer_norm_epsilon=1e-6, - initializer_range=0.02, - max_position_embeddings=8192, - scale_attn_weights=True, - use_cache=True, - bf16=False, - fp16=False, - fp32=False, - kv_channels=128, - rotary_pct=1.0, - rotary_emb_base=10000, - use_dynamic_ntk=True, - use_logn_attn=True, - use_flash_attn="auto", - intermediate_size=22016, - no_bias=True, - tie_word_embeddings=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.emb_dropout_prob = emb_dropout_prob - self.attn_dropout_prob = attn_dropout_prob - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.scale_attn_weights = scale_attn_weights - self.use_cache = use_cache - self.max_position_embeddings = max_position_embeddings - self.bf16 = bf16 - self.fp16 = fp16 - self.fp32 = fp32 - self.kv_channels = kv_channels - self.rotary_pct = rotary_pct - self.rotary_emb_base = rotary_emb_base - self.use_dynamic_ntk = use_dynamic_ntk - self.use_logn_attn = use_logn_attn - self.use_flash_attn = use_flash_attn - self.no_bias = no_bias - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) From c1c0d00b88320f97e00a3175fac235a232893da5 Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 09:33:38 +0800 Subject: [PATCH 017/113] Don't use cupy when `enforce_eager=True` (#3037) --- vllm/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c1a75924c6d72..f5b2145c22d6f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -284,7 +284,10 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", is_driver_worker=True, ) - self._run_workers("init_model", cupy_port=get_open_port()) + # don't use cupy for eager mode + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) self._run_workers( "load_model", max_concurrent_workers=self.parallel_config. From 4dd6416faf7cc3035ac3f5c8375eb27e6b0eee80 Mon Sep 17 00:00:00 2001 From: Roy Date: Tue, 27 Feb 2024 10:31:10 +0800 Subject: [PATCH 018/113] Fix stablelm (#3038) --- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/stablelm.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 411814f2f5d09..40b375bb6fbea 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -43,6 +43,7 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), + "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), } # Models not supported by ROCm. 
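The registry entry added above covers checkpoints whose config reports the newer `StableLmForCausalLM` architecture name. The `stablelm.py` hunks below likewise read each config value under both the legacy attribute name (`rope_pct`, `norm_eps`) and the newer one (`partial_rotary_factor`, `layer_norm_eps`), so either config flavour works. The fallback in isolation (a sketch; the checkpoint name is illustrative):

from transformers import AutoConfig

# Either a legacy StableLM-Epoch config or a newer StableLm config will do here.
config = AutoConfig.from_pretrained("stabilityai/stablelm-3b-4e1t",
                                    trust_remote_code=True)
rope_pct = getattr(config, "rope_pct",
                   getattr(config, "partial_rotary_factor", 1))
norm_eps = getattr(config, "norm_eps",
                   getattr(config, "layer_norm_eps", 1e-05))
print(rope_pct, norm_eps)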
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 95e5ad8ede63e..44c57e5a6d4f9 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -94,7 +94,9 @@ def __init__(self, 1, self.total_num_key_value_heads // tp_size) self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) + rope_pct = getattr(config, "rope_pct", + getattr(config, "partial_rotary_factor", 1)) + self.rotary_ndims = int(self.head_dim * rope_pct) self.scaling = self.head_dim**-0.5 self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_key_value_heads * self.head_dim @@ -114,7 +116,6 @@ def __init__(self, self.hidden_size, bias=False, linear_method=linear_method) - self.rotary_ndims = int(self.head_dim * self.config.rope_pct) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.rotary_ndims, @@ -152,10 +153,11 @@ def __init__( super().__init__() self.self_attn = StablelmAttention(config) self.mlp = StablelmMLP(config, linear_method) - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.norm_eps) + eps=norm_eps) def forward( self, @@ -199,7 +201,9 @@ def __init__(self, StablelmDecoderLayer(config, linear_method) for _ in range(config.num_hidden_layers) ]) - self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps) + norm_eps = getattr(config, "norm_eps", + getattr(config, "layer_norm_eps", 1e-05)) + self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps) def forward( self, From 48a8f4a7fd18d516ffc0a304219ef722613ea792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E5=A4=A7=E6=88=90?= <1345739055@qq.com> Date: Tue, 27 Feb 2024 11:17:06 +0800 Subject: [PATCH 019/113] Support Orion model (#2539) Co-authored-by: zhangdacheng Co-authored-by: Woosuk Kwon --- README.md | 1 + docs/source/models/supported_models.rst | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/orion.py | 322 ++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 vllm/model_executor/models/orion.py diff --git a/README.md b/README.md index 7a16bb1fef044..f771788db2b89 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) - OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.) - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) +- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.) - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c1639ca9e056a..35b548d2737ce 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -71,6 +71,9 @@ Alongside each architecture, we include some popular models that use it. * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 
+ * - :code:`OrionForCausalLM` + - Orion + - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 40b375bb6fbea..66d28207d664f 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -38,6 +38,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), + "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py new file mode 100644 index 0000000000000..0b067d4fc8802 --- /dev/null +++ b/vllm/model_executor/models/orion.py @@ -0,0 +1,322 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py +# Copyright (c) OrionStar Inc. +# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE +"""Inference-only Orion-14B model compatible with HuggingFace weights.""" +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class OrionMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class OrionAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + linear_method=linear_method, + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class OrionDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = OrionAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + ) + self.mlp = OrionMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = 
nn.LayerNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states, None + + +class OrionModel(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + OrionDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class OrionForCausalLM(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + linear_method: Optional[LinearMethodBase] = None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = OrionModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. 
+ continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) From 2410e320b35cd704059b7c6ba8d8ba7643fe46ee Mon Sep 17 00:00:00 2001 From: Jingru Date: Tue, 27 Feb 2024 11:22:16 +0800 Subject: [PATCH 020/113] fix `get_ip` error in pure ipv6 environment (#2931) --- vllm/utils.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 8ca95e148eb39..c8ac57de6f5f5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -162,9 +162,16 @@ def _async_wrapper(*args, **kwargs) -> asyncio.Future: def get_ip() -> str: + # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable - return s.getsockname()[0] + try: + s.connect(("dns.google", 80)) # Doesn't need to be reachable + return s.getsockname()[0] + except OSError: + # try ipv6 + s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + s.connect(("dns.google", 80)) + return s.getsockname()[0] def get_distributed_init_method(ip: str, port: int) -> str: @@ -172,9 +179,16 @@ def get_distributed_init_method(ip: str, port: int) -> str: def get_open_port() -> int: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] + # try ipv4 + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] + except OSError: + # try ipv6 + with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def set_cuda_visible_devices(device_ids: List[int]) -> None: From 4bd18ec0c719d2910040e22fa60503fdbfce1332 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 19:44:29 -0800 Subject: [PATCH 021/113] [Minor] Fix type annotation in fused moe (#3045) --- vllm/model_executor/layers/fused_moe/fused_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 830fde6c4eb6d..08e3c2d5b706e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -2,7 +2,7 @@ import functools import json import os -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Tuple import torch import triton @@ -137,7 +137,7 @@ def fused_moe_kernel( def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, - num_experts: int) -> (torch.Tensor, torch.Tensor, torch.Tensor): + num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns the token distribution across experts to be compatible with block size for matrix multiplication. 
@@ -185,7 +185,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, - mul_routed_weight: bool, top_k: int, config: dict): + mul_routed_weight: bool, top_k: int, + config: Dict[str, Any]) -> None: assert topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 From e0ade06d6305cf84b41c1962cdd9dfdbfee16ac9 Mon Sep 17 00:00:00 2001 From: Dylan Hawk <51147702+dylanwhawk@users.noreply.github.com> Date: Mon, 26 Feb 2024 19:51:53 -0800 Subject: [PATCH 022/113] Support logit bias for OpenAI API (#3027) --- tests/entrypoints/test_openai_server.py | 48 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 33 +++++++++++++ vllm/entrypoints/openai/serving_chat.py | 8 +--- vllm/entrypoints/openai/serving_completion.py | 6 +-- 4 files changed, 83 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 29d0e6fd537d5..72e2374899793 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -9,6 +9,8 @@ import openai # use the official client for correctness check from huggingface_hub import snapshot_download # downloading lora to test lora requests +from vllm.transformers_utils.tokenizer import get_tokenizer + MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here @@ -310,5 +312,51 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, assert texts[0] == texts[1] +async def test_logits_bias(server, client: openai.AsyncOpenAI): + prompt = "Hello, my name is" + max_tokens = 5 + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + # Test exclusive selection + token_id = 1000 + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token_id): 100}, + ) + assert completion.choices[0].text is not None and len( + completion.choices[0].text) >= 5 + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), + add_special_tokens=False)["input_ids"] + assert all([ + response == expected + for response, expected in zip(response_tokens, expected_tokens) + ]) + + # Test ban + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + ) + response_tokens = tokenizer(completion.choices[0].text, + add_special_tokens=False)["input_ids"] + first_response = completion.choices[0].text + completion = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=max_tokens, + temperature=0.0, + logit_bias={str(token): -100 + for token in response_tokens}, + ) + assert first_response != completion.choices[0].text + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f57a2fb775783..e85e7e2b1ede9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -8,6 +8,8 @@ from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams +import torch + class ErrorResponse(BaseModel): object: str 
= "error" @@ -88,6 +90,21 @@ class ChatCompletionRequest(BaseModel): def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: raise ValueError("Top logprobs must be set when logprobs is.") + + logits_processors = None + if self.logit_bias: + + def logit_bias_logits_processor( + token_ids: List[int], + logits: torch.Tensor) -> torch.Tensor: + for token_id, bias in self.logit_bias.items(): + # Clamp the bias between -100 and 100 per OpenAI API spec + bias = min(100, max(-100, bias)) + logits[int(token_id)] += bias + return logits + + logits_processors = [logit_bias_logits_processor] + return SamplingParams( n=self.n, presence_penalty=self.presence_penalty, @@ -111,6 +128,7 @@ def to_sampling_params(self) -> SamplingParams: spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, + logits_processors=logits_processors, ) @@ -149,6 +167,20 @@ class CompletionRequest(BaseModel): def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 + logits_processors = None + if self.logit_bias: + + def logit_bias_logits_processor( + token_ids: List[int], + logits: torch.Tensor) -> torch.Tensor: + for token_id, bias in self.logit_bias.items(): + # Clamp the bias between -100 and 100 per OpenAI API spec + bias = min(100, max(-100, bias)) + logits[int(token_id)] += bias + return logits + + logits_processors = [logit_bias_logits_processor] + return SamplingParams( n=self.n, best_of=self.best_of, @@ -172,6 +204,7 @@ def to_sampling_params(self): spaces_between_special_tokens=(self.spaces_between_special_tokens), include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, + logits_processors=logits_processors, ) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index dd152583c2329..5635ac6c9e106 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -39,19 +39,13 @@ async def create_chat_completion( See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI ChatCompletion API. - NOTE: Currently we do not support the following features: + NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) - - logit_bias (to be supported by vLLM engine) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret - if request.logit_bias is not None and len(request.logit_bias) > 0: - # TODO: support logit_bias in vLLM engine. - return self.create_error_response( - "logit_bias is not currently supported") - try: prompt = self.tokenizer.apply_chat_template( conversation=request.messages, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 667b659f81e9e..610f53549da48 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -264,10 +264,9 @@ async def create_completion(self, request: CompletionRequest, See https://platform.openai.com/docs/api-reference/completions/create for the API specification. This API mimics the OpenAI Completion API. 
- NOTE: Currently we do not support the following features: + NOTE: Currently we do not support the following feature: - suffix (the language models we currently support do not support suffix) - - logit_bias (to be supported by vLLM engine) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: @@ -277,9 +276,6 @@ async def create_completion(self, request: CompletionRequest, if request.suffix is not None: return self.create_error_response( "suffix is not currently supported") - if request.logit_bias is not None and len(request.logit_bias) > 0: - return self.create_error_response( - "logit_bias is not currently supported") model_name = request.model request_id = f"cmpl-{random_uuid()}" From 8b430d7dea5695324636fc458c1cce52213bd499 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 26 Feb 2024 20:23:50 -0800 Subject: [PATCH 023/113] [Minor] Fix StableLMEpochForCausalLM -> StableLmForCausalLM (#3046) --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 35b548d2737ce..9d4ec663a16e5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -83,7 +83,7 @@ Alongside each architecture, we include some popular models that use it. * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. - * - :code:`StableLMEpochForCausalLM` + * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. From 71bcaf99e2cb2c677bf3a9addb9e8039cbcab22a Mon Sep 17 00:00:00 2001 From: Tao He Date: Tue, 27 Feb 2024 17:14:31 +0800 Subject: [PATCH 024/113] Enable GQA support in the prefix prefill kernels (#3007) Signed-off-by: Tao He --- tests/kernels/test_prefix_prefill.py | 61 +++++++++++++------ vllm/model_executor/layers/attention.py | 34 ++++++----- .../layers/triton_kernel/prefix_prefill.py | 39 ++++++++---- 3 files changed, 87 insertions(+), 47 deletions(-) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index ac93b32588cca..c068b38a66910 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -8,7 +8,8 @@ from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask -NUM_HEADS = [12] +NUM_HEADS = [64] +NUM_QUERIES_PER_KV = [1, 8, 64] HEAD_SIZES = [128] DTYPES = [torch.float16] CUDA_DEVICES = [ @@ -17,12 +18,14 @@ @pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_contexted_kv_attention( num_heads: int, + num_queries_per_kv: int, head_size: int, dtype: torch.dtype, device: str, @@ -41,28 +44,29 @@ def test_contexted_kv_attention( subquery_lens = [random.randint(16, MAX_SEQ_LEN) for _ in range(BS)] ctx_lens = [random.randint(16, MAX_CTX_LEN) for _ in range(BS)] seq_lens = [a + b for a, b in zip(subquery_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv num_tokens = sum(subquery_lens) query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) query.uniform_(-1e-3, 1e-3) output = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) - kv = torch.empty(sum(seq_lens), 2, num_heads, head_size, 
dtype=dtype) + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) kv.uniform_(-1e-3, 1e-3) key, value = kv.unbind(dim=1) k_cache = torch.zeros(cache_size, block_size, - num_heads, + num_kv_heads, head_size, dtype=dtype) v_cache = torch.zeros(cache_size, block_size, - num_heads, + num_kv_heads, head_size, dtype=dtype) - k = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) - v = torch.zeros(sum(subquery_lens), num_heads, head_size, dtype=dtype) + k = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(subquery_lens), num_kv_heads, head_size, dtype=dtype) values = torch.arange(0, cache_size, dtype=torch.long) values = values[torch.randperm(cache_size)] block_table = values[:BS * max_block_per_request].view( @@ -93,19 +97,21 @@ def test_contexted_kv_attention( end_loc = start_loc + block_size start_slot = block_table[i, block_id] * block_size end_slot = start_slot + end_loc - start_loc - k_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - key[start_loc:end_loc]) - v_cache.view(-1, num_heads, head_size)[start_slot:end_slot].copy_( - value[start_loc:end_loc]) + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) cur_ctx += block_size block_id += 1 # transpose K_cache[num_blocks, block_size, num_kv_heads, head_size] # to K_cache[num_blocks, num_kv_heads, head_size/8, block_size, 8] - k_cache = k_cache.view(-1, block_size, num_heads, head_size // 8, + k_cache = k_cache.view(-1, block_size, num_kv_heads, head_size // 8, 8).permute(0, 2, 3, 1, 4).contiguous() # transpose V_cache[num_blocks, block_size, num_kv_heads, head_size] # to V_cache[num_blocks, num_kv_heads, head_size, block_size] - v_cache = v_cache.view(-1, block_size, num_heads, + v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() # Warm up the Triton kernel by calling it once before actually measuring generation time @@ -123,12 +129,29 @@ def test_contexted_kv_attention( attn_op = xops.fmha.cutlass.FwOp() + if num_kv_heads != num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. 
+ # + # see also: vllm/model_executor/layers/attention.py + query = query.view(query.shape[0], num_kv_heads, num_queries_per_kv, + query.shape[-1]) + key = key[:, :, None, :].expand(key.shape[0], num_kv_heads, + num_queries_per_kv, key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], num_kv_heads, + num_queries_per_kv, value.shape[-1]) + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) + attn_bias = BlockDiagonalCausalFromBottomRightMask.from_seqlens( subquery_lens, seq_lens) output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), + query, + key, + value, attn_bias=attn_bias, p=0.0, scale=scale, @@ -137,9 +160,9 @@ def test_contexted_kv_attention( torch.cuda.synchronize() start_time = time.time() output_ref = xops.memory_efficient_attention_forward( - query.unsqueeze(0), - key.unsqueeze(0), - value.unsqueeze(0), + query, + key, + value, attn_bias=attn_bias, p=0.0, scale=scale, @@ -148,5 +171,5 @@ def test_contexted_kv_attention( torch.cuda.synchronize() end_time = time.time() print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") - output_ref = output_ref.squeeze(0) + output_ref = output_ref.squeeze(0, 2) assert torch.allclose(output_ref, output, atol=1e-6, rtol=0) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 0622a54db1bc0..2a82325b80213 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -137,25 +137,27 @@ def forward( ) if input_metadata.is_prompt: - # Prompt run. - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) # normal attention if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. 
diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index a1a2ab0c4805c..70f09224f1cf6 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -45,6 +45,7 @@ def _fwd_kernel( stride_v_cache_h, stride_v_cache_d, stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -53,6 +54,8 @@ def _fwd_kernel( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) @@ -85,13 +88,14 @@ def _fwd_kernel( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -131,9 +135,9 @@ def _fwd_kernel( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -232,6 +236,7 @@ def _fwd_kernel_flash_attn_v2( stride_v_cache_h, stride_v_cache_d, stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -240,6 +245,8 @@ def _fwd_kernel_flash_attn_v2( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch) cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) @@ -272,13 +279,14 @@ def _fwd_kernel_flash_attn_v2( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -317,9 +325,9 @@ def _fwd_kernel_flash_attn_v2( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -420,6 +428,7 @@ def _fwd_kernel_alibi( stride_v_cache_h, stride_v_cache_d, 
stride_v_cache_bl, + num_queries_per_kv: int, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, @@ -429,6 +438,8 @@ def _fwd_kernel_alibi( cur_head = tl.program_id(1) start_m = tl.program_id(2) + cur_kv_head = cur_head // num_queries_per_kv + # cur_batch_seq_len: the length of prompts # cur_batch_ctx_len: the length of prefix # cur_batch_in_all_start_index: the start id of the dim=0 @@ -468,13 +479,14 @@ def _fwd_kernel_alibi( mask=(start_n + offs_n) < cur_batch_ctx_len, other=0) off_k = (bn[None, :] * stride_k_cache_bs + - cur_head * stride_k_cache_h + + cur_kv_head * stride_k_cache_h + (offs_d[:, None] // x) * stride_k_cache_d + ((start_n + offs_n[None, :]) % block_size) * stride_k_cache_bl + (offs_d[:, None] % x) * stride_k_cache_x) off_v = ( - bn[:, None] * stride_v_cache_bs + cur_head * stride_v_cache_h + + bn[:, None] * stride_v_cache_bs + + cur_kv_head * stride_v_cache_h + offs_d[None, :] * stride_v_cache_d + (start_n + offs_n[:, None]) % block_size * stride_v_cache_bl) k = tl.load(K_cache + off_k, @@ -522,9 +534,9 @@ def _fwd_kernel_alibi( l_i = l_i_new m_i = m_i_new - off_k = (offs_n[None, :] * stride_kbs + cur_head * stride_kh + + off_k = (offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd) - off_v = (offs_n[:, None] * stride_vbs + cur_head * stride_vh + + off_v = (offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd) k_ptrs = K + off_k v_ptrs = V + off_v @@ -628,6 +640,7 @@ def context_attention_fwd(q, sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] + num_queries_per_kv = q.shape[1] // k.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, @@ -674,6 +687,7 @@ def context_attention_fwd(q, v_cache.stride(2), v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, @@ -721,6 +735,7 @@ def context_attention_fwd(q, v_cache.stride(2), v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] + num_queries_per_kv=num_queries_per_kv, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, From a8683102cc0ab9c1a0c3ae1ba2b7954f78eba1b3 Mon Sep 17 00:00:00 2001 From: Ganesh Jagadeesan Date: Wed, 28 Feb 2024 00:26:15 -0500 Subject: [PATCH 025/113] multi-lora documentation fix (#3064) --- docs/source/models/lora.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 1910f26506611..21b18c75fc552 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -58,7 +58,7 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server. .. code-block:: bash - python -m vllm.entrypoints.api_server \ + python -m vllm.entrypoints.openai.api_server \ --model meta-llama/Llama-2-7b-hf \ --enable-lora \ --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/ @@ -89,3 +89,15 @@ with its base model: Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other LoRA adapter requests if they were provided and ``max_loras`` is set high enough). + +The following is an example request + +.. 
code-block::bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq From e46fa5d52e02ee48d5fdd12b35e39993008b4bd6 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Wed, 28 Feb 2024 13:38:26 +0800 Subject: [PATCH 026/113] Restrict prometheus_client >= 0.18.0 to prevent errors when importing pkgs (#3070) --- requirements-neuron.txt | 2 +- requirements-rocm.txt | 2 +- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 36e629add664d..858472c20ca8c 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -6,4 +6,4 @@ neuronx-cc fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 diff --git a/requirements-rocm.txt b/requirements-rocm.txt index e759ba7d028d9..53bd11de7c9de 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -10,4 +10,4 @@ transformers >= 4.38.0 # Required for Gemma. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 diff --git a/requirements.txt b/requirements.txt index de93ba6354cda..d4599ec95d945 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ xformers == 0.0.23.post1 # Required for CUDA 12.1. fastapi uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. -prometheus_client +prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. From 3b7178cfa4a317922d4aef9dd3b2647b8d950e7d Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Wed, 28 Feb 2024 09:34:34 -0800 Subject: [PATCH 027/113] [Neuron] Support inference with transformers-neuronx (#2569) --- examples/offline_inference_neuron.py | 33 ++++ tests/lora/conftest.py | 8 +- vllm/config.py | 41 ++++- vllm/engine/arg_utils.py | 16 +- vllm/engine/llm_engine.py | 21 ++- vllm/lora/layers.py | 4 + vllm/model_executor/__init__.py | 3 +- vllm/model_executor/layers/sampler.py | 18 +- vllm/model_executor/model_loader.py | 10 +- vllm/model_executor/models/__init__.py | 12 +- vllm/model_executor/models/neuron/llama.py | 79 +++++++++ vllm/model_executor/neuron_model_loader.py | 66 +++++++ vllm/model_executor/sampling_metadata.py | 4 +- vllm/model_executor/utils.py | 17 ++ vllm/utils.py | 8 + vllm/worker/cache_engine.py | 11 +- vllm/worker/model_runner.py | 16 +- vllm/worker/neuron_worker.py | 191 +++++++++++++++++++++ 18 files changed, 516 insertions(+), 42 deletions(-) create mode 100644 examples/offline_inference_neuron.py create mode 100644 vllm/model_executor/models/neuron/llama.py create mode 100644 vllm/model_executor/neuron_model_loader.py create mode 100644 vllm/worker/neuron_worker.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py new file mode 100644 index 0000000000000..9b9dc4d94892f --- /dev/null +++ b/examples/offline_inference_neuron.py @@ -0,0 +1,33 @@ +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. 
+llm = LLM( + model="openlm-research/open_llama_3b", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as max sequence length, + # when targeting neuron device. Currently, this is a known limitation in continuous batching + # support in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=128, + block_size=128, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, or explicitly assigned. + device="neuron") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0ca0715334c25..75f4e41290c36 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -131,9 +131,11 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() get_model_old = get_model - def get_model_patched(model_config, device_config, lora_config=None): - return get_model_old(model_config, device_config, - LoRAConfig(max_loras=4, max_lora_rank=8)) + def get_model_patched(model_config, device_config, **kwargs): + return get_model_old(model_config, + device_config, + lora_config=LoRAConfig(max_loras=4, + max_lora_rank=8)) with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) diff --git a/vllm/config.py b/vllm/config.py index bd0dc89b585f7..fc848b72d7f2a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -8,7 +8,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version +from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version logger = init_logger(__name__) @@ -380,13 +380,21 @@ def __init__( disable_custom_all_reduce: bool = False, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size - self.tensor_parallel_size = tensor_parallel_size + if is_neuron(): + # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. + # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # to multiple NeuronCores. + self.tensor_parallel_size = 1 + self.neuron_tp_degree = tensor_parallel_size + else: + self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce - self.world_size = pipeline_parallel_size * tensor_parallel_size - if self.world_size > 1: + self.world_size = pipeline_parallel_size * self.tensor_parallel_size + # Ray worker is not supported for Neuron backend. 
+ if self.world_size > 1 and not is_neuron(): self.worker_use_ray = True self._verify_args() @@ -465,8 +473,29 @@ def _verify_args(self) -> None: class DeviceConfig: - def __init__(self, device: str = "cuda") -> None: - self.device = torch.device(device) + def __init__(self, device: str = "auto") -> None: + if device == "auto": + # Automated device type detection + if torch.cuda.is_available(): + self.device_type = "cuda" + elif is_neuron(): + self.device_type = "neuron" + else: + raise RuntimeError("No supported device detected.") + else: + # Device type is assigned explicitly + self.device_type = device + + # Some device types require processing inputs on CPU + if self.device_type in ["neuron"]: + self.device = torch.device("cpu") + else: + # Set device with device type + self.device = torch.device(self.device_type) + + @property + def is_neuron(self): + return self.device_type == "neuron" @dataclass diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a4efd171b871d..c01e7311fb89a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -44,7 +44,7 @@ class EngineArgs: lora_extra_vocab_size: int = 256 lora_dtype = 'auto' max_cpu_loras: Optional[int] = None - device: str = 'cuda' + device: str = 'auto' def __post_init__(self): if self.tokenizer is None: @@ -171,7 +171,7 @@ def add_cli_args( parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 128], help='token block size') parser.add_argument('--seed', type=int, @@ -264,13 +264,11 @@ def add_cli_args( help=('Maximum number of LoRAs to store in CPU memory. ' 'Must be >= than max_num_seqs. ' 'Defaults to max_num_seqs.')) - parser.add_argument( - "--device", - type=str, - default=EngineArgs.device, - choices=["cuda"], - help=('Device type for vLLM execution. ' - 'Currently, only CUDA-compatible devices are supported.')) + parser.add_argument("--device", + type=str, + default=EngineArgs.device, + choices=["auto", "cuda", "neuron"], + help='Device type for vLLM execution.') return parser @classmethod diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f5b2145c22d6f..f0fd7efdef813 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -3,6 +3,7 @@ import os import time import pickle +import importlib from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) @@ -20,7 +21,8 @@ SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) -from vllm.utils import Counter, set_cuda_visible_devices, get_ip, get_open_port, get_distributed_init_method +from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, + get_open_port, get_distributed_init_method) if ray: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -31,6 +33,12 @@ logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
@@ -138,10 +146,17 @@ def __init__( def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + def _init_workers(self): # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker + Worker = self._dispatch_worker() assert self.parallel_config.world_size == 1, ( "Ray is required if parallel_config.world_size > 1.") @@ -243,7 +258,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Lazy import the Worker to avoid importing torch.cuda/xformers # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.worker import Worker + Worker = self._dispatch_worker() # Initialize torch distributed process group for the workers. model_config = copy.deepcopy(self.model_config) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e1aac20b038b4..e667d70f71e39 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -795,6 +795,10 @@ def __init__( self.dtype = dtype self.device = device + @property + def logits_as_hidden_states(self): + return self.base_layer.logits_as_hidden_states + @property def vocab_size(self): return self.base_layer.vocab_size diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 0d5b2004ad7cb..cd6dbde5f54cf 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,7 +1,6 @@ from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.model_loader import get_model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.utils import set_random_seed, get_model __all__ = [ "InputMetadata", diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 884d84387e505..71655b216fb3d 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -10,6 +10,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.utils import is_neuron class Sampler(nn.Module): @@ -32,6 +33,8 @@ def __init__(self, org_vocab_size: Optional[int] = None) -> None: super().__init__() self.vocab_size = vocab_size + # Transformers-neuronx generate outputs as logits directly. + self.logits_as_hidden_states = is_neuron() # original vocabulary size (without LoRA). self.org_vocab_size = org_vocab_size or vocab_size @@ -55,10 +58,14 @@ def forward( embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[SamplerOutput]: # Get the hidden states that we use for sampling. - hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) + if self.logits_as_hidden_states: + logits = hidden_states + else: + hidden_states = _prune_hidden_states(hidden_states, + sampling_metadata) - # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, embedding, embedding_bias) + # Get the logits for the next tokens. + logits = self._get_logits(hidden_states, embedding, embedding_bias) # Only perform sampling in the driver worker. 
# Note: `_get_logits` is still distributed across TP workers because @@ -395,7 +402,8 @@ def _sample( sample_metadata[sampling_type] = (seq_group_ids, seq_groups, is_prompts, sample_indices) if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1) + greedy_samples = torch.argmax(logprobs[sample_indices.long()], + dim=-1) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_best_of = 1 for seq_group, is_prompt in zip(seq_groups, is_prompts): @@ -407,7 +415,7 @@ def _sample( "generators": sampling_metadata.generators, } multinomial_samples[sampling_type] = _multinomial( - probs[sample_indices], max_best_of, **seeded_args) + probs[sample_indices.long()], max_best_of, **seeded_args) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] else: diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py index ebe092b5d62ba..cb64d80c8147d 100644 --- a/vllm/model_executor/model_loader.py +++ b/vllm/model_executor/model_loader.py @@ -1,11 +1,11 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Optional, Type +from typing import Type import torch import torch.nn as nn -from vllm.config import DeviceConfig, ModelConfig, LoRAConfig +from vllm.config import DeviceConfig, ModelConfig from vllm.model_executor.models import ModelRegistry from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights) @@ -37,9 +37,9 @@ def _get_model_architecture(model_config: ModelConfig) -> Type[nn.Module]: f"Supported architectures: {ModelRegistry.get_supported_archs()}") -def get_model(model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig] = None) -> nn.Module: +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> nn.Module: + lora_config = kwargs.get("lora_config", None) model_class = _get_model_architecture(model_config) # Get the (maybe quantized) linear method. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 66d28207d664f..e4f3a785cd99a 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -4,7 +4,7 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm.utils import is_hip +from vllm.utils import is_hip, is_neuron logger = init_logger(__name__) @@ -61,6 +61,9 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } +# Models not supported by Neuron. 
+_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} + class ModelRegistry: @@ -77,8 +80,15 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: logger.warning( f"Model architecture {model_arch} is partially supported " "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + elif is_neuron(): + if model_arch not in _NEURON_SUPPORTED_MODELS: + raise ValueError( + f"Model architecture {model_arch} is not supported by " + "Neuron for now.") module_name, model_cls_name = _MODELS[model_arch] + if is_neuron(): + module_name = _NEURON_SUPPORTED_MODELS[model_arch] module = importlib.import_module( f"vllm.model_executor.models.{module_name}") return getattr(module, model_cls_name, None) diff --git a/vllm/model_executor/models/neuron/llama.py b/vllm/model_executor/models/neuron/llama.py new file mode 100644 index 0000000000000..e2856da99d9b1 --- /dev/null +++ b/vllm/model_executor/models/neuron/llama.py @@ -0,0 +1,79 @@ +"""Inference-only LLaMA model compatible with HuggingFace weights.""" +import os +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class LlamaForCausalLM(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + with torch.inference_mode(): + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids.flatten()) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.llama.model import LlamaForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers.models.llama import LlamaForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = LlamaForCausalLM.from_pretrained(model_name_or_path, + low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = LlamaForSampling.from_pretrained(split_model_dir, + **kwargs) + self.model.to_neuron() diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py new file mode 100644 index 0000000000000..b8d63d4ff12fc --- /dev/null +++ b/vllm/model_executor/neuron_model_loader.py @@ -0,0 +1,66 @@ +"""Utilities for selecting and loading models.""" 
+from typing import Type + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import ModelConfig, DeviceConfig +from vllm.model_executor.models import ModelRegistry + +TORCH_DTYPE_TO_NEURON_AMP = { + "auto": "f32", + "half": "f16", + "float16": "f16", + "bfloat16": "bf16", + "float": "f32", + "float32": "f32", + torch.float16: "f16", + torch.bfloat16: "bf16", + torch.float32: "f32", +} + + +def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: + architectures = getattr(config, "architectures", []) + for arch in architectures: + model_cls = ModelRegistry.load_model_cls(arch) + if model_cls is not None: + return model_cls + raise ValueError( + f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {ModelRegistry.get_supported_archs()}") + + +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> nn.Module: + from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + + parallel_config = kwargs.get("parallel_config") + scheduler_config = kwargs.get("scheduler_config") + + model_class = _get_model_architecture(model_config.hf_config) + linear_method = None + + # Create a model instance. + model = model_class(model_config.hf_config, linear_method) + + continuous_batching_config = ContinuousBatchingConfig( + batch_size_for_shared_caches=scheduler_config.max_num_seqs) + neuron_config = NeuronConfig( + continuous_batching=continuous_batching_config) + + # Load the weights from the cached or downloaded files. + model.load_weights( + model_config.model, + model_config.download_dir, + model_config.load_format, + model_config.revision, + tp_degree=parallel_config.neuron_tp_degree, + amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], + neuron_config=neuron_config, + context_length_estimate=[scheduler_config.max_model_len], + n_positions=[scheduler_config.max_model_len], + batch_size=scheduler_config.max_num_seqs) + + return model.eval() diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index d0ffeecd2d74d..7deb80801856e 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -5,7 +5,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData -from vllm.utils import in_wsl +from vllm.utils import in_wsl, is_neuron _SAMPLING_EPS = 1e-5 @@ -155,7 +155,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. 
- pin_memory = not in_wsl() + pin_memory = not in_wsl() and not is_neuron() prompt_max_len = max(len(tokens) for tokens in prompt_tokens) prompt_padded_tokens = [ tokens + [vocab_size] * (prompt_max_len - len(tokens)) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 336bc1cd005cf..0113e3edf0675 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,10 +1,18 @@ """Utils for model executor.""" import random +import importlib from typing import Any, Dict, Optional import numpy as np import torch +from vllm.config import DeviceConfig, ModelConfig + +DEVICE_TO_MODEL_LOADER_MAP = { + "cuda": "model_loader", + "neuron": "neuron_model_loader", +} + def set_random_seed(seed: int) -> None: random.seed(seed) @@ -33,3 +41,12 @@ def set_weight_attrs( assert not hasattr( weight, key), (f"Overwriting existing tensor attribute: {key}") setattr(weight, key, value) + + +def get_model(model_config: ModelConfig, device_config: DeviceConfig, + **kwargs) -> torch.nn.Module: + model_loader_module = DEVICE_TO_MODEL_LOADER_MAP[device_config.device_type] + imported_model_loader = importlib.import_module( + f"vllm.model_executor.{model_loader_module}") + get_model_fn = imported_model_loader.get_model + return get_model_fn(model_config, device_config, **kwargs) diff --git a/vllm/utils.py b/vllm/utils.py index c8ac57de6f5f5..a4f9bfe6aac99 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -118,6 +118,14 @@ def is_hip() -> bool: return torch.version.hip is not None +def is_neuron() -> bool: + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + + def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # NOTE: This import statement should be executed lazily since diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index bbe33989fc2a4..880299783935c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -3,10 +3,9 @@ import torch -from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) @@ -39,6 +38,10 @@ def __init__( self.num_gpu_blocks = cache_config.num_gpu_blocks self.num_cpu_blocks = cache_config.num_cpu_blocks + # Skip initializing CUDA stream and buffer for Neuron backend. + if is_neuron(): + return + if cache_config.cache_dtype == "auto": self.dtype = model_config.dtype else: @@ -121,6 +124,8 @@ def _swap( dst: List[KVCache], src_to_dst: Dict[int, int], ) -> None: + from vllm._C import cache_ops + with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): src_key_cache, src_value_cache = src[i] @@ -140,6 +145,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: + from vllm._C import cache_ops + key_caches = [key_cache for key_cache, _ in self.gpu_cache] value_caches = [value_cache for _, value_cache in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 
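The utils.py and vllm/utils.py hunks above add two small mechanisms: backend detection by attempting an optional import (is_neuron), and a device-type keyed map that is resolved lazily with importlib, so CUDA-only modules such as the vllm._C extension are only imported on backends that need them (the cache_ops import above is deferred for the same reason). The snippet below is a minimal, standalone sketch of that dispatch pattern for illustration only; the my_pkg.* module paths and the __main__ check are hypothetical and do not appear in the patch.

"""Sketch of lazy, device-keyed loader dispatch (illustrative names only)."""
import importlib
from typing import Callable


def has_module(name: str) -> bool:
    # Optional-import feature detection, mirroring is_neuron() above.
    try:
        importlib.import_module(name)
        return True
    except ImportError:
        return False


# Device type -> module expected to expose a get_model() function.
# The module paths here are hypothetical placeholders.
LOADER_MAP = {
    "cuda": "my_pkg.model_loader",
    "neuron": "my_pkg.neuron_model_loader",
}


def resolve_get_model(device_type: str) -> Callable:
    # Import the backend module only when that backend is actually selected,
    # so e.g. CUDA extensions are never pulled in on a Neuron host.
    module = importlib.import_module(LOADER_MAP[device_type])
    return module.get_model


if __name__ == "__main__":
    print("transformers_neuronx available:", has_module("transformers_neuronx"))
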
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b99a409e02d1e..efe570778fb43 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -80,9 +80,16 @@ def __init__( self.in_wsl = in_wsl() self.kv_cache_dtype = kv_cache_dtype + # Set enforce_eager to True for Neuron backend, to avoid capturing graph + if self.device_config.is_neuron: + self.model_config.enforce_eager = True + def load_model(self) -> None: - self.model = get_model(self.model_config, self.device_config, - self.lora_config) + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) vocab_size = self.model.config.vocab_size @@ -393,6 +400,7 @@ def _prepare_sample( selected_token_start_idx = 0 categorized_sample_indices = {t: [] for t in SamplingType} categorized_sample_indices_start_idx = 0 + pin_memory = not self.in_wsl and not self.device_config.is_neuron max_subquery_len = max(subquery_lens) if subquery_lens else 1 for i, seq_group_metadata in enumerate(seq_group_metadata_list): @@ -443,12 +451,12 @@ def _prepare_sample( selected_token_indices = _async_h2d(selected_token_indices, dtype=torch.long, target_device=self.device, - pin_memory=not self.in_wsl) + pin_memory=pin_memory) categorized_sample_indices = { t: _async_h2d(seq_ids, dtype=torch.int, target_device=self.device, - pin_memory=not self.in_wsl) + pin_memory=pin_memory) for t, seq_ids in categorized_sample_indices.items() } diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py new file mode 100644 index 0000000000000..3229a21c11a38 --- /dev/null +++ b/vllm/worker/neuron_worker.py @@ -0,0 +1,191 @@ +"""A Neuron worker class.""" +from typing import Dict, List, Optional, Tuple + +import torch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.model_runner import ModelRunner + + +class Worker: + """A worker class that executes the model on a group of neuron cores. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.model_runner = ModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # self.init_cache_engine(). 
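+        # On Neuron these stay mostly unused: CacheEngine skips CUDA
+        # stream/buffer allocation and profile_num_available_blocks() simply
+        # reports max_num_seqs "GPU" blocks with no CPU swap space.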
+ self.cache_config = None + self.cache_engine = None + self.cache_events = None + self.gpu_cache = None + + def init_model(self) -> None: + # Initialize the distributed environment. + _init_distributed_environment(self.parallel_config, + self.rank, + self.distributed_init_method, + distributed_backend="gloo") + + # Initialize the model. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int = 128, + gpu_memory_utilization: float = 0.9, + cpu_swap_space: int = 0, + cache_dtype: str = "float16", + ) -> Tuple[int, int]: + """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + num_gpu_blocks = self.scheduler_config.max_num_seqs + num_cpu_blocks = 0 + return num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig) -> None: + self.cache_config = cache_config + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config) + self.model_runner.set_block_size(self.cache_engine.block_size) + + def warm_up_model(self) -> None: + # Warm up is maintained in transformers-neuronx + pass + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + issued_cache_op = False + if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + issued_cache_op = True + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + issued_cache_op = True + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + issued_cache_op = True + + cache_events = self.cache_events if issued_cache_op else None + + # Wait for cache operations to finish. + if cache_events is not None: + raise NotImplementedError( + "cache operations are not implemented for neuron backend.") + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. 
+ if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.gpu_cache) + return output + + +def _init_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + distributed_backend: Optional[str] = None, +) -> None: + """Initialize the distributed environment.""" + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + distributed_backend = distributed_backend if distributed_backend else "nccl" + torch.distributed.init_process_group( + backend=distributed_backend, + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1)) + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) From 929b4f2973ec6a53ea4f0f03d21147ef8b8278be Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 28 Feb 2024 13:03:28 -0800 Subject: [PATCH 028/113] Add LoRA support for Gemma (#3050) --- .buildkite/test-pipeline.yaml | 2 +- csrc/punica/bgmv/bgmv_config.h | 2 ++ tests/lora/conftest.py | 5 ++++ tests/lora/test_gemma.py | 46 +++++++++++++++++++++++++++++ tests/lora/test_punica.py | 4 +-- vllm/model_executor/models/gemma.py | 28 ++++++++++++++++-- vllm/model_executor/models/llama.py | 2 +- 7 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 tests/lora/test_gemma.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index efcc4d2d07a12..c65ab04b8ddda 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,7 +50,7 @@ steps: command: pytest -v -s worker - label: LoRA Test - command: pytest -v -s lora + command: pytest -v -s lora --forked - label: Metrics Test command: pytest -v -s metrics diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index ebf638f104c3f..d5fee9c40d00c 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ f(in_T, out_T, W_T, narrow, 8192) \ @@ -39,6 +40,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 75f4e41290c36..67273144ecd02 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -126,6 +126,11 @@ def mixtral_lora_files(): return snapshot_download(repo_id="terrysun/mixtral-lora-adapter") +@pytest.fixture(scope="session") +def gemma_lora_files(): + return 
snapshot_download(repo_id="wskwon/gemma-7b-test-lora") + + @pytest.fixture def llama_2_7b_engine_extra_embeddings() -> nn.Module: cleanup() diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py new file mode 100644 index 0000000000000..0082c6e74e888 --- /dev/null +++ b/tests/lora/test_gemma.py @@ -0,0 +1,46 @@ +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "google/gemma-7b" + + +def do_sample(llm, lora_path: str, lora_id: int) -> str: + prompts = [ + "Quote: Imagination is", + "Quote: Be yourself;", + "Quote: So many books,", + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def test_gemma_lora(gemma_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4) + + expected_lora_output = [ + "more important than knowledge.\nAuthor: Albert Einstein\n", + "everyone else is already taken.\nAuthor: Oscar Wilde\n", + "so little time\nAuthor: Frank Zappa\n", + ] + + output1 = do_sample(llm, gemma_lora_files, lora_id=1) + for i in range(len(expected_lora_output)): + assert output1[i].startswith(expected_lora_output[i]) + output2 = do_sample(llm, gemma_lora_files, lora_id=2) + for i in range(len(expected_lora_output)): + assert output2[i].startswith(expected_lora_output[i]) diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index 903814faa5dc7..cbe0f6fa2e851 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -44,8 +44,8 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, - 5504, 5632, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, 32000, - 32256, 32512, 32768, 33024 + 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, + 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index d8b515993d8ff..03948132d32c3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from torch import nn from transformers import GemmaConfig +from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul from vllm.model_executor.layers.attention import PagedAttention @@ -246,12 +247,36 @@ def forward( class GemmaForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + # Gemma does not apply LoRA to the embedding layer. + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, config: GemmaConfig, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config # Unused. 
super().__init__() self.config = config self.linear_method = linear_method @@ -305,9 +330,6 @@ def load_weights(self, weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra layer for lora models. - if "lm_head" in name: - continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. if "norm.weight" in name: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b7f6b8f3ec374..d35887cc0f6a3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -27,6 +27,7 @@ from torch import nn from transformers import LlamaConfig +from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.attention import PagedAttention @@ -45,7 +46,6 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] From 01a5d18a537b65a156cfa1a77706693a24c869c1 Mon Sep 17 00:00:00 2001 From: CHU Tianxiang Date: Thu, 29 Feb 2024 13:52:23 +0800 Subject: [PATCH 029/113] Add Support for 2/3/8-bit GPTQ Quantization Models (#2330) --- csrc/ops.h | 6 +- csrc/quantization/gptq/matrix_view.cuh | 123 ++ csrc/quantization/gptq/q_gemm.cu | 1452 +++++++++++++++-- csrc/quantization/gptq/qdq_2.cuh | 87 + csrc/quantization/gptq/qdq_3.cuh | 141 ++ csrc/quantization/gptq/qdq_4.cuh | 100 +- csrc/quantization/gptq/qdq_8.cuh | 40 + .../layers/quantization/gptq.py | 16 +- 8 files changed, 1736 insertions(+), 229 deletions(-) create mode 100644 csrc/quantization/gptq/qdq_2.cuh create mode 100644 csrc/quantization/gptq/qdq_3.cuh create mode 100644 csrc/quantization/gptq/qdq_8.cuh diff --git a/csrc/ops.h b/csrc/ops.h index dbdd2c2c57945..08dfb0e8604f1 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -98,11 +98,13 @@ torch::Tensor gptq_gemm( torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama); + bool use_exllama, + int bit); void gptq_shuffle( torch::Tensor q_weight, - torch::Tensor q_perm); + torch::Tensor q_perm, + int bit); void moe_align_block_size( torch::Tensor topk_ids, diff --git a/csrc/quantization/gptq/matrix_view.cuh b/csrc/quantization/gptq/matrix_view.cuh index 1fdf019b29028..eda3436eb5375 100644 --- a/csrc/quantization/gptq/matrix_view.cuh +++ b/csrc/quantization/gptq/matrix_view.cuh @@ -146,6 +146,129 @@ public: __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } }; +class MatrixView_q2_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q2_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int shift = (column & 0x0f) * 2; + return (data[row * width / 16 + column / 16] >> shift) & 0x03; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const + { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row * width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x0f) * 2; + uint32_t d = data[row 
* width / 16 + column / 16] >> shift; + items[0] = d & 0x03; + items[1] = (d >> 2) & 0x03; + items[2] = (d >> 4) & 0x03; + items[3] = (d >> 6) & 0x03; + } +}; + +class MatrixView_q3_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q3_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int z_w = column * 3 / 32; + int z_mod = column & 0x1f; + + if (z_mod == 10) { + return (data[row * width * 3 / 32 + z_w] >> 30) | ((data[row * width * 3 / 32 + (z_w + 1)] << 2) & 0x4); + } else if (z_mod == 21) { + return (data[row * width * 3 / 32 + z_w] >> 31) | ((data[row * width * 3 / 32 + (z_w + 1)] << 1) & 0x6); + } else if (z_mod < 10) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3)) & 0x07; + } else if (z_mod < 21) { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 32)) & 0x07; + } else { + return (data[row * width * 3 / 32 + z_w] >> (z_mod * 3 - 64)) & 0x07; + } + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x1f); + uint32_t d; + if (shift <= 4) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3); + } else if (shift == 8) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 24) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0x0f) << 8); + } else if (shift <= 16) { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 32); + } else if (shift == 20) { + d = (data[row * width / 32 * 3 + column * 3 / 32] >> 28) | ((data[row * width / 32 * 3 + column * 3 / 32 + 1] & 0xff) << 4); + } else { + d = data[row * width / 32 * 3 + column * 3 / 32] >> (shift * 3 - 64); + } + items[0] = d & 0x07; + items[1] = (d >> 3) & 0x07; + items[2] = (d >> 6) & 0x07; + items[3] = (d >> 9) & 0x07; + } +}; + +class MatrixView_q8_row +{ +public: + const uint32_t* data; + const int height; + const int width; + + __device__ __forceinline__ MatrixView_q8_row(const uint32_t* data, const int height, const int width) + : data(data), height(height), width(width) + { } + + __device__ __forceinline__ int item(int row, int column) const + { + int shift = (column & 0x03) * 8; + return (data[row * width / 4 + column / 4] >> shift) & 0xff; + } + + __device__ __forceinline__ void item2(int (&items)[2], int row, int column) const + { + int shift = (column & 0x03) * 8; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + } + + __device__ __forceinline__ void item4(int (&items)[4], int row, int column) const + { + int shift = (column & 0x03) * 2; + uint32_t d = data[row * width / 4 + column / 4] >> shift; + items[0] = d & 0xff; + items[1] = (d >> 8) & 0xff; + items[2] = (d >> 16) & 0xff; + items[3] = (d >> 24) & 0xff; + } +}; + } // namespace gptq } // namespace vllm #endif diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu index a5d2345f1e7fd..655158e38f557 100644 --- a/csrc/quantization/gptq/q_gemm.cu +++ b/csrc/quantization/gptq/q_gemm.cu @@ -13,7 +13,10 @@ Adapted from https://github.com/turboderp/exllamav2 and https://github.com/qwopq #include "compat.cuh" #include "matrix_view.cuh" +#include "qdq_2.cuh" +#include "qdq_3.cuh" #include "qdq_4.cuh" +#include "qdq_8.cuh" namespace vllm { namespace gptq { @@ -22,6 +25,7 @@ namespace gptq { #define BLOCK_M_SIZE_MAX 8 #define MAX_GROUPS_IN_BLOCK (BLOCK_KN_SIZE / 32) 
#define MAX_Q_GEMM_ROWS 50 +#define MAX_Q_GEMM_ROWS_8BIT 24 #define MAX_ALT_GEMM_ROWS 8 #define THREADS_X 32 #define THREADS_Y 32 @@ -75,6 +79,106 @@ __forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr) return __half2float(__low2half(result)) + __half2float(__high2half(result)); } +__forceinline__ __device__ half2 dot22_8(half2(&dq)[4], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ half2 dot22_16(half2(&dq)[8], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ half2 dot22_32(half2(&dq)[16], const half* a_ptr, const half2 g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + return __hfma2(result, __halves2half2(qs_h, qs_h), g_result); +} + +__forceinline__ __device__ float dot22_8_f(half2(&dq)[4], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 4; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ float dot22_16_f(half2(&dq)[8], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ float dot22_32_f(half2(&dq)[16], const half* a_ptr, const float g_result, const float qs_f) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + float result_f = __half2float(__low2half(result)) + __half2float(__high2half(result)); + return fma(result_f, qs_f, g_result); +} + +__forceinline__ __device__ half dot22_8_h(half2(&dq)[4], const half* a_ptr, const half g_result, const half qs_h) +{ + // Use FP32 accumulator to avoid potential overflow since unscaled weights are in the range -128..127 + + float result = {}; + #pragma unroll + for (int i = 0; i < 4; i++) + { + half2 w01 = dq[i]; + float w0 = __low2float(w01); + float w1 = __high2float(w01); + float x0 = __half2float(*a_ptr++); + float x1 = __half2float(*a_ptr++); + result = fma(w0, x0, result); + result = fma(w1, x1, result); + } + float qs = __half2float(qs_h); + result *= qs; + half result_h = __float2half_rn(result); + return __hadd(result_h, g_result); +} + +__forceinline__ __device__ half dot22_16_h(half2(&dq)[8], const half* a_ptr, const half g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 8; i++) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = 
__hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); +} + +__forceinline__ __device__ half dot22_32_h(half2(&dq)[16], const half* a_ptr, const half g_result, const half qs_h) +{ + half2 result = {}; + const half2* a2_ptr = (const half2*)a_ptr; + #pragma unroll + for (int i = 0; i < 16; i += 1) result = __hfma2(dq[i], *a2_ptr++, result); + half result_h = __hadd(__low2half(result), __high2half(result)); + return __hfma(result_h, qs_h, g_result); +} + + typedef void (*fp_gemm_half_q_half_gptq_kernel) ( const half*, @@ -89,8 +193,9 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel) const int* ); + template -__global__ void gemm_half_q_half_gptq_kernel +__global__ void gemm_half_q_half_gptq_4bit_kernel ( const half* __restrict__ a, const uint32_t* __restrict__ b_q_weight, @@ -231,80 +336,794 @@ __global__ void gemm_half_q_half_gptq_kernel } } - -fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel(bool first_block, const int m_count) +template +__global__ void gemm_half_q_half_gptq_2bit_kernel +( + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm +) { - #if BLOCK_M_SIZE_MAX >= 1 - if (m_count == 1) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 2 - if (m_count == 2) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 3 - if (m_count == 3) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 4 - if (m_count == 4) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 5 - if (m_count == 5) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 6 - if (m_count == 6) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 7 - if (m_count == 7) return gemm_half_q_half_gptq_kernel; - #endif - #if BLOCK_M_SIZE_MAX >= 8 - if (m_count == 8) return gemm_half_q_half_gptq_kernel; - #endif - return NULL; -} + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + int t = threadIdx.x; -void gemm_half_q_half_cuda_part + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + + half a0; + if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 2); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* 
a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 1; j++) + { + const int4* b_ptr4 = (int4*) b_ptr; + int4 load_int4 = *b_ptr4; + + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); + + #pragma unroll + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_16_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_16_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_16_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_16_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + + b_ptr += size_n; + a_ptr += 16; + } + + k += 16; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +template +__global__ void gemm_half_q_half_gptq_3bit_kernel ( - const half* a, - const uint32_t* b_q_weight, - const uint32_t* b_gptq_qzeros, - const half* b_gptq_scales, - const int* b_q_perm, - half* c, - int size_m, - int size_n, - int size_k, - int m_count, - int groups + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm ) { - dim3 blockDim, gridDim; - blockDim.x = BLOCK_KN_SIZE; - blockDim.y = 1; - blockDim.z = 1; - gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); - gridDim.y = DIVIDE(size_m, m_count); - gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); - fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count); + int t = threadIdx.x; - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - kernel<<>> - ( - a, - b_q_weight, - b_gptq_qzeros, - b_gptq_scales, - c, - size_m, - size_n, - size_k, - groups, - b_q_perm - ); -} + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = block_a[m]; + half a0; + if (b_q_perm) a0 = 
a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / 32 * 3; + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 1; j++) + { + int4 load_int4[3]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); + + #pragma unroll + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_32_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_32_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_32_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_32_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 32; + } + + k += 32; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +template +__global__ void gemm_half_q_half_gptq_8bit_kernel +( + const half* __restrict__ a, + const uint32_t* __restrict__ b_q_weight, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + half* __restrict__ c, + const int size_m, + const int size_n, + const int size_k, + const int groups, + const int* __restrict__ b_q_perm +) +{ + MatrixView_half a_(a, size_m, size_k); + MatrixView_half_rw c_(c, size_m, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int t = threadIdx.x; + + // Block + int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; + int offset_m = blockIdx.y * m_count; + int offset_k = blockIdx.z * BLOCK_KN_SIZE; + + int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); + int end_m = min(offset_m + m_count, size_m); + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + int n = offset_n + t * 4; + + // Preload block_a + __shared__ half block_a[m_count][BLOCK_KN_SIZE]; + + if (offset_k + t < end_k) + { + for (int m = 0; m < m_count; ++m) + { + const half* a_ptr = a_.item_ptr(offset_m + m, 0); + half* block_a_ptr = 
block_a[m]; + + half a0; + if (b_q_perm) a0 = a_ptr[b_q_perm[offset_k + t]]; + else a0 = a_ptr[offset_k + t]; + block_a_ptr[t] = a0; + } + } + + // Zero output + if (n >= size_n) return; + + if (blockIdx.z == 0) + { + for (int m = 0; m < m_count; m++) + *((uint64_t*)c_.item_ptr(offset_m + m, n)) = 0; + } + + __syncthreads(); + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // a, b offset + int qk = offset_k / (32 / 8); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + const half* a_ptr = &block_a[0][0]; + int a_stride = BLOCK_KN_SIZE; + + // Initial group + int zeros[4]; + half scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + // Column result + half block_c[m_count][4] = {}; + + // Dequantize and multiply + int k = offset_k; + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4(scales, group, n); + } + + #pragma unroll + for (int j = 0; j < 4; j++) + { + int4 load_int4[2]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); + + for (int m = 0; m < m_count; m++) + { + block_c[m][0] = dot22_8_h(dq[0], a_ptr + m * a_stride, block_c[m][0], scales[0]); + block_c[m][1] = dot22_8_h(dq[1], a_ptr + m * a_stride, block_c[m][1], scales[1]); + block_c[m][2] = dot22_8_h(dq[2], a_ptr + m * a_stride, block_c[m][2], scales[2]); + block_c[m][3] = dot22_8_h(dq[3], a_ptr + m * a_stride, block_c[m][3], scales[3]); + } + a_ptr += 8; + } + k += 32; + } + + for (int m = 0; m < m_count; m++) + { + half2 *out = (half2*) c_.item_ptr(offset_m + m, n); + half2 result01 = __halves2half2(block_c[m][0], block_c[m][1]); + half2 result23 = __halves2half2(block_c[m][2], block_c[m][3]); + atomicAdd(out , result01); + atomicAdd(out + 1, result23); + } +} + +fp_gemm_half_q_half_gptq_kernel pick_gemm_half_q_half_gptq_kernel( + bool first_block, const int m_count, const int bit) +{ + #define SELECT_KERNEL(M_COUNT) \ + if (m_count == M_COUNT) { \ + if (bit == 2) return gemm_half_q_half_gptq_2bit_kernel; \ + if (bit == 3) return gemm_half_q_half_gptq_3bit_kernel; \ + if (bit == 4) return gemm_half_q_half_gptq_4bit_kernel; \ + if (bit == 8) return gemm_half_q_half_gptq_8bit_kernel; \ + } + #if BLOCK_M_SIZE_MAX >= 1 + SELECT_KERNEL(1); + #endif + #if BLOCK_M_SIZE_MAX >= 2 + SELECT_KERNEL(2); + #endif + #if BLOCK_M_SIZE_MAX >= 3 + SELECT_KERNEL(3); + #endif + #if BLOCK_M_SIZE_MAX >= 4 + SELECT_KERNEL(4); + #endif + #if BLOCK_M_SIZE_MAX >= 5 + SELECT_KERNEL(5); + #endif + #if BLOCK_M_SIZE_MAX >= 6 + SELECT_KERNEL(6); + #endif + #if BLOCK_M_SIZE_MAX >= 7 + SELECT_KERNEL(7); + #endif + #if BLOCK_M_SIZE_MAX >= 8 + SELECT_KERNEL(8); + #endif + return NULL; +} + + +void gemm_half_q_half_cuda_part +( + const half* a, + const uint32_t* b_q_weight, + const uint32_t* b_gptq_qzeros, + const half* b_gptq_scales, + const int* b_q_perm, + half* c, + int size_m, + int size_n, + int size_k, + int m_count, + int groups, + int bit +) +{ + dim3 blockDim, gridDim; + blockDim.x = BLOCK_KN_SIZE; + blockDim.y = 1; + blockDim.z = 1; + 
gridDim.x = DIVIDE(size_n, BLOCK_KN_SIZE * 4); + gridDim.y = DIVIDE(size_m, m_count); + gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + + fp_gemm_half_q_half_gptq_kernel kernel = pick_gemm_half_q_half_gptq_kernel(true, m_count, bit); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + kernel<<>> + ( + a, + b_q_weight, + b_gptq_qzeros, + b_gptq_scales, + c, + size_m, + size_n, + size_k, + groups, + b_q_perm + ); +} + + +__global__ void reconstruct_exllama_8bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / (32 / 8); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } + + for (int p = 0; p < 4; p++) + { + int4 load_int4[2]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][4]; + dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n, zeros[0] + 1); + dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n, zeros[1] + 1); + dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n, zeros[2] + 1); + dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n, zeros[3] + 1); + + //half* dqh = (half*)dq; + if (b_q_perm) + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} + +__global__ void reconstruct_exllama_4bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + 
MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / (32 / 4); + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + half2 z1z16[4][2]; + half2 y1y16[4][2]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); + dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); + dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); + dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); + } + + for (int p = 0; p < 4; p++) + { + half2 dq[4][4]; + const int4* b_ptr4 = (int4*) b_ptr; + int4 load_int4 = *b_ptr4; + + dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); + dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); + dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); + dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); + + b_ptr += size_n; + //half* dqh = (half*)dq; + if (b_q_perm) + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 4; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} + +__global__ void reconstruct_exllama_3bit_kernel +( + const uint32_t* __restrict__ b_q_weight, + const int* __restrict__ b_q_perm, + const uint32_t* __restrict__ b_gptq_qzeros, + const half* __restrict__ b_gptq_scales, + const int size_k, + const int size_n, + const int groups, + half* __restrict__ b +) +{ + MatrixView_half_rw b_(b, size_k, size_n); + MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); + + int offset_k = BLOCK_KN_SIZE * blockIdx.y; + int offset_n = BLOCK_KN_SIZE * blockIdx.x 
* 4; + + int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); + + // Preload remapping table + __shared__ int perm[BLOCK_KN_SIZE]; + int t = threadIdx.x; + + if (b_q_perm) + { + if (offset_k + t < size_k) + perm[t] = b_q_perm[offset_k + t]; + } + + // Column + int n = offset_n + t * 4; + if (n >= size_n) return; + + // Find initial group + int groupsize = size_k / groups; + int group = offset_k / groupsize; + int nextgroup = offset_k + groupsize; + + // b offset + int qk = offset_k / 32* 3; + + const uint32_t* b_ptr = b_q_weight + qk * size_n + n; + + // Initial zeros/scale + int zeros[4]; + half2 scales[4]; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + + __syncthreads(); + + int k = offset_k; + int lk = 0; + + while (k < end_k) + { + if (k == nextgroup) + { + group++; + nextgroup += groupsize; + b_gptq_qzeros_.item4(zeros, group, n); + b_gptq_scales_.item4_h2(scales, group, n); + } + + for (int p = 0; p < 1; p++) + { + int4 load_int4[3]; + load_int4[0] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[1] = *((int4*) b_ptr); b_ptr += size_n; + load_int4[2] = *((int4*) b_ptr); b_ptr += size_n; + + half2 dq[4][16]; + dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0], size_n, zeros[0] + 1); + dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1], size_n, zeros[1] + 1); + dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2], size_n, zeros[2] + 1); + dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3], size_n, zeros[3] + 1); + + if (b_q_perm) + { + for (int j = 0; j < 16; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(perm[lk++], n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + else + { + for (int j = 0; j < 16; j++) + { + for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); + b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); + b_.set4(offset_k + lk++, n, __high2half(dq[0][j]), __high2half(dq[1][j]), __high2half(dq[2][j]), __high2half(dq[3][j])); + } + } + } + k += 32; + } +} -__global__ void reconstruct_exllama_kernel +__global__ void reconstruct_exllama_2bit_kernel ( const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm, @@ -317,7 +1136,7 @@ __global__ void reconstruct_exllama_kernel ) { MatrixView_half_rw b_(b, size_k, size_n); - MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); + MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); int offset_k = BLOCK_KN_SIZE * blockIdx.y; @@ -345,21 +1164,15 @@ __global__ void reconstruct_exllama_kernel int nextgroup = offset_k + groupsize; // b offset - int qk = offset_k / (32 / 4); + int qk = offset_k / (32 / 2); const uint32_t* b_ptr = b_q_weight + qk * size_n + n; // Initial zeros/scale int zeros[4]; half2 scales[4]; - half2 z1z16[4][2]; - half2 y1y16[4][2]; b_gptq_qzeros_.item4(zeros, group, n); b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); __syncthreads(); @@ -374,28 +1187,24 @@ __global__ void 
reconstruct_exllama_kernel nextgroup += groupsize; b_gptq_qzeros_.item4(zeros, group, n); b_gptq_scales_.item4_h2(scales, group, n); - dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]); - dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]); - dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]); - dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]); } - for (int p = 0; p < 4; p++) + for (int p = 0; p < 2; p++) { - half2 dq[4][4]; const int4* b_ptr4 = (int4*) b_ptr; int4 load_int4 = *b_ptr4; - dequant_4bit_8_gptq(load_int4.x, dq[0], z1z16[0], y1y16[0], size_n, false); - dequant_4bit_8_gptq(load_int4.y, dq[1], z1z16[1], y1y16[1], size_n, false); - dequant_4bit_8_gptq(load_int4.z, dq[2], z1z16[2], y1y16[2], size_n, false); - dequant_4bit_8_gptq(load_int4.w, dq[3], z1z16[3], y1y16[3], size_n, false); + half2 dq[4][8]; + dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1); + dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1); + dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1); + dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1); b_ptr += size_n; //half* dqh = (half*)dq; if (b_q_perm) { - for (int j = 0; j < 4; j++) + for (int j = 0; j < 8; j++) { for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); b_.set4(perm[lk++], n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); @@ -404,7 +1213,7 @@ __global__ void reconstruct_exllama_kernel } else { - for (int j = 0; j < 4; j++) + for (int j = 0; j < 8; j++) { for (int v = 0; v < 4; v++) dq[v][j] = __hmul2(scales[v], dq[v][j]); b_.set4(offset_k + lk++, n, __low2half(dq[0][j]), __low2half(dq[1][j]), __low2half(dq[2][j]), __low2half(dq[3][j])); @@ -416,7 +1225,6 @@ __global__ void reconstruct_exllama_kernel } } - void reconstruct_exllama ( const uint32_t* b_q_weight, @@ -426,7 +1234,8 @@ void reconstruct_exllama half* out, int height, int width, - int groups + int groups, + int bit ) { dim3 blockDim, gridDim; @@ -435,6 +1244,15 @@ void reconstruct_exllama gridDim.y = DIVIDE(height, BLOCK_KN_SIZE); gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + auto reconstruct_exllama_kernel = reconstruct_exllama_4bit_kernel; + if (bit == 2) { + reconstruct_exllama_kernel = reconstruct_exllama_2bit_kernel; + } else if (bit == 3) { + reconstruct_exllama_kernel = reconstruct_exllama_3bit_kernel; + } else if (bit == 8) { + reconstruct_exllama_kernel = reconstruct_exllama_8bit_kernel; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); reconstruct_exllama_kernel<<>> ( @@ -450,7 +1268,7 @@ void reconstruct_exllama } -__global__ void gemm_half_q_half_alt_kernel( +__global__ void gemm_half_q_half_alt_4bit_kernel( const half2* __restrict__ vec, const uint32_t* __restrict__ mat, half* __restrict__ mul, @@ -548,6 +1366,95 @@ __global__ void gemm_half_q_half_alt_kernel( } +__global__ void gemm_half_q_half_alt_8bit_kernel( + const half2* __restrict__ vec, + const uint32_t* __restrict__ mat, + half* __restrict__ mul, + const half* __restrict__ scales, + const uint32_t* __restrict__ zeros, + const int* __restrict__ g_idx, + int batch, + int height, + int width +) +{ + int zero_width = width / 4; + int vec_height = height * 2; + const int blockwidth2 = BLOCK_KN_SIZE / 2; + int b = blockIdx.y * BLOCK_M_SIZE_MAX; + int b_end = min(BLOCK_M_SIZE_MAX, batch - b); + int h = BLOCK_KN_SIZE * blockIdx.z / 4; + int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; + int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + + __shared__ half2 
blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; + if (threadIdx.x < h_end) { + for (int m = 0; m < b_end; ++m) { + blockvec[m][threadIdx.x] = + vec[(m + b) * vec_height + blockIdx.z * BLOCK_KN_SIZE / 2 + + threadIdx.x]; + } + } + + + if (blockIdx.z == 0) + { + for (int m = 0; m < b_end; m++) + mul[(b + m) * width + w] = __int2half_rn(0); + } + __syncthreads(); + + int i = width * h + w; + int g_h = h * 4; + int k = 0; + int z_w = w / 4; + int z_mod = (w % 4) * 8; + half2 res2; + half res[BLOCK_M_SIZE_MAX] = {}; + + unsigned int tmp; + while (k < h_end) { + tmp = mat[i]; + half2 scales_tmp[2]; + half2 zeros_tmp[2]; + for (int tmp_k = 0; tmp_k < 2; tmp_k++) { + int g = g_idx[g_h + (k + tmp_k) * 2]; + int g2 = g_idx[g_h + (k + tmp_k) * 2 + 1]; + half scale_f = scales[g * width + w]; + half scale_f2 = scales[g2 * width + w]; + half2 scale = __halves2half2(scale_f, scale_f2); + half2 zero = __halves2half2( + __hmul(scale_f, __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)), + __hmul(scale_f2, __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)) + ); + scales_tmp[tmp_k] = scale; + zeros_tmp[tmp_k] = zero; + } + for (int m = 0; m < b_end; m++) { +#ifndef USE_ROCM + res2 = {}; +#else + res2.x = __half_as_ushort(__float2half(0)); + res2.y = __half_as_ushort(__float2half(0)); +#endif + half2 v12 = __halves2half2(__int2half_rn(tmp & 0xFF), __int2half_rn((tmp >> 8) & 0xFF)); + res2 = __hfma2(__hfma2(v12, scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2); + half2 v34 = __halves2half2(__int2half_rn((tmp >> 16) & 0xFF), __int2half_rn((tmp >> 24) & 0xFF)); + res2 = __hfma2(__hfma2(v34, scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2); +#ifndef USE_ROCM + res[m] = __hadd(res[m], __hadd(res2.x, res2.y)); +#else + res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y))); +#endif + } + i += width; + k += 2; + } + for (int m = 0; m < b_end; m++) { + atomicAdd(&mul[(b + m) * width + w], res[m]); + } +} + void gemm_half_q_half_alt ( const half* a, @@ -558,7 +1465,8 @@ void gemm_half_q_half_alt half* c, int size_m, int size_n, - int size_k + int size_k, + int bit ) { dim3 blockDim, gridDim; @@ -569,8 +1477,13 @@ void gemm_half_q_half_alt gridDim.y = DIVIDE(size_m, BLOCK_M_SIZE_MAX); gridDim.z = DIVIDE(size_k, BLOCK_KN_SIZE); + auto kernel = gemm_half_q_half_alt_4bit_kernel; + if (bit == 8) { + kernel = gemm_half_q_half_alt_8bit_kernel; + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - gemm_half_q_half_alt_kernel<<>> + kernel<<>> ( (const half2*) a, b_q_weight, @@ -579,12 +1492,12 @@ void gemm_half_q_half_alt b_gptq_qzeros, b_g_idx, size_m, - size_k / 8, + size_k / 32 * bit, size_n ); } - +template __global__ void reconstruct_gptq_kernel ( const uint32_t* __restrict__ w, @@ -600,30 +1513,79 @@ __global__ void reconstruct_gptq_kernel // Start of block int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; - int row = blockIdx.y * 8; + int row = blockIdx.y * 32 / bit; if (column >= width) return; // Views - MatrixView_q4_column w_(w, height, width); MatrixView_half_rw out_(out, height, width); MatrixView_half w_scales_(w_scales, group, width); - MatrixView_q4_row w_zeros_(w_zeros, group, width); + T w_zeros_(w_zeros, group, width); - uint32_t w_read = w_.item_uint32_t(row, column); + uint32_t w_read = w[blockIdx.y * width + column]; half* out_ptr = out_.item_ptr(row, column); #pragma unroll - for (int s = 0; s < 32; s += 4) + for (int s = 0; s < 32; s += bit) { - int group = g_idx[row + s / 4]; + int group = g_idx[row + s / 
bit]; half w_scale = w_scales_.item(group, column); uint32_t w_zero = w_zeros_.item(group, column) + 1; - half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); + half w_item = __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero), w_scale); *out_ptr = w_item; out_ptr += out_.width; } } +__global__ void reconstruct_gptq_3bit_kernel +( + const uint32_t* __restrict__ w, + const half* __restrict__ w_scales, + const uint32_t* __restrict__ w_zeros, + const int* __restrict__ g_idx, + const int height, + const int width, + const int group, + half* __restrict__ out +) +{ + // Start of block + int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; + int row = blockIdx.y * 32; + if (column >= width) return; + + // Views + + MatrixView_half_rw out_(out, height, width); + MatrixView_half w_scales_(w_scales, group, width); + MatrixView_q3_row w_zeros_(w_zeros, group, width); + + uint32_t w1 = w[(blockIdx.y * 3) * width + column]; + uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column]; + uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column]; + half* out_ptr = out_.item_ptr(row, column); + + #pragma unroll + for (int i = 0; i < 32; i += 1) + { + int group = g_idx[row + i]; + half w_scale = w_scales_.item(group, column); + uint32_t w_zero = w_zeros_.item(group, column) + 1; + int w_item; + if (i == 10) { + w_item = (w1 >> 30) | ((w2 << 2) & 0x4); + } else if (i == 21) { + w_item = (w2 >> 31) | ((w3 << 1) & 0x6); + } else if (i < 10) { + w_item = ((w1 >> (i * 3)) & 0x7); + } else if (i < 21) { + w_item = ((w2 >> (i * 3 - 32)) & 0x7); + } else { + w_item = ((w3 >> (i * 3 - 64)) & 0x7); + } + *out_ptr = __hmul(__int2half_rn(w_item - w_zero), w_scale); + out_ptr += out_.width; + } +} void reconstruct_gptq ( @@ -634,16 +1596,28 @@ void reconstruct_gptq half* out, int height, int width, - int groups + int groups, + int bit ) { dim3 blockDim, gridDim; blockDim.x = BLOCK_KN_SIZE; blockDim.y = 1; - gridDim.y = DIVIDE(height, 8); + gridDim.y = DIVIDE(height, 32 / bit); gridDim.x = DIVIDE(width, BLOCK_KN_SIZE); + + auto kernel = reconstruct_gptq_kernel; + if (bit == 2) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 8) { + kernel = reconstruct_gptq_kernel; + } else if (bit == 3) { + kernel = reconstruct_gptq_3bit_kernel; + gridDim.y = DIVIDE(height, 32); + } + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - reconstruct_gptq_kernel<<>> + kernel<<>> ( b_q_weight, b_gptq_scales, @@ -671,19 +1645,27 @@ void gemm_half_q_half_cuda int size_n, int size_k, int groups, - bool use_exllama + bool use_exllama, + int bit ) { - if ((use_exllama && size_m > MAX_Q_GEMM_ROWS) || (!use_exllama && size_m > MAX_ALT_GEMM_ROWS)) { + bool use_reconstruct; + if (use_exllama) { + use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) || (bit != 8 && size_m > MAX_Q_GEMM_ROWS)); + } else { + // The 2/3-bit kernels are somehow slower than dequant + gemm baseline, so we disabled them for now. 
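For reference, the templated `reconstruct_gptq_kernel` above spells out the GPTQ storage format: for 2/4/8-bit weights each 32-bit word of `qweight` packs `32 / bit` values along the k dimension, and every element is dequantized as `scale * (q - (zero + 1))` using its group's scale and zero point. A minimal NumPy sketch of that per-column unpack-and-dequantize step, using illustrative names rather than vLLM APIs:

```python
import numpy as np

def dequant_gptq_column(w_packed, scales, zeros, g_idx, bit):
    """Reference version of reconstruct_gptq_kernel for a single output column.

    w_packed: the uint32 words holding this column (k * bit / 32 of them)
    scales, zeros: per-group quantization parameters for this column
    g_idx: maps each of the k rows to its quantization group
    """
    mask = (1 << bit) - 1
    vals_per_word = 32 // bit          # 16, 8 or 4 for 2-, 4- or 8-bit weights
    out = np.empty(len(w_packed) * vals_per_word, dtype=np.float32)
    for word_idx, word in enumerate(w_packed):
        for s in range(vals_per_word):
            k = word_idx * vals_per_word + s
            group = g_idx[k]
            q = (int(word) >> (s * bit)) & mask
            # GPTQ stores zero points off by one, hence the "+ 1".
            out[k] = float(scales[group]) * (q - (int(zeros[group]) + 1))
    return out
```

The 3-bit path cannot reuse this layout because 32 is not a multiple of 3: 32 values occupy 96 bits, i.e. three uint32 words, and values 10 and 21 straddle word boundaries (bits 30..32 and 63..65). That is why `reconstruct_gptq_3bit_kernel` and `make_sequential_3bit_kernel` special-case `i == 10` and `i == 21` instead of using the generic shift-and-mask loop.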
+ use_reconstruct = (bit < 4 || size_m > MAX_ALT_GEMM_ROWS); + } + if (use_reconstruct) { // Reconstruct FP16 matrix, then cuBLAS if (use_exllama) { reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, temp_dq, - size_k, size_n, groups); + size_k, size_n, groups, bit); } else { reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - temp_dq, size_k, size_n, groups); + temp_dq, size_k, size_n, groups, bit); } const half alpha = __float2half(1.0f); @@ -707,7 +1689,7 @@ void gemm_half_q_half_cuda { gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, c, last_chunk, size_n, size_k, BLOCK_M_SIZE_MAX, - groups); + groups, bit); } if (last_chunk_size) @@ -715,18 +1697,17 @@ void gemm_half_q_half_cuda gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k, last_chunk_size, - groups); + groups, bit); } } else { gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, - c, size_m, size_n, size_k); + c, size_m, size_n, size_k, bit); } } - -__global__ void shuffle_kernel +__global__ void shuffle_4bit_kernel ( uint32_t* __restrict__ b_q_weight, const int size_k, @@ -740,13 +1721,53 @@ __global__ void shuffle_kernel while (k < size_k) { shuffle_4bit_8 (b_ptr, size_n); b_ptr += 1 * size_n; k += 8; } } +__global__ void shuffle_8bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_8bit_4 (b_ptr, size_n); b_ptr += 1 * size_n; k += 4; } +} + +__global__ void shuffle_2bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_2bit_16(b_ptr, size_n); b_ptr += 1 * size_n; k += 16; } +} + +__global__ void shuffle_3bit_kernel +( + uint32_t* __restrict__ b_q_weight, + const int size_k, + const int size_n +) +{ + int n = blockIdx.x * THREADS_X + threadIdx.x; + if (n >= size_n) return; + int k = 0; + uint32_t* b_ptr = b_q_weight + n; + while (k < size_k) { shuffle_3bit_32(b_ptr, size_n); b_ptr += 3 * size_n; k += 32; } +} -__global__ void make_sequential_kernel +__global__ void make_sequential_4bit_kernel ( const uint32_t* __restrict__ w, uint32_t* __restrict__ w_new, const int* __restrict__ q_perm, - const int w_height, const int w_width ) { @@ -778,37 +1799,204 @@ __global__ void make_sequential_kernel w_new2[w_new2_row * w2_stride + w2_column] = dst; } +__global__ void make_sequential_2bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + const uint64_t* w2 = (uint64_t*) w; + uint64_t* w_new2 = (uint64_t*) w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 4; + uint64_t dst = 0; + + #pragma unroll + for (int i = 0; i < 16; i++) + { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 4; + int w2_subrow = source_row & 0x0f; + int w2_row_shift = w2_subrow << 1; + int wnew2_row_shift = i << 1; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x0000000300000003; + src 
<<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; +} + +__global__ void make_sequential_3bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + int w_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w_column >= w_width) return; + int w_new_row = blockIdx.y * 3; + int q_perm_idx = blockIdx.y << 5; + uint32_t dst[3] = {0, 0, 0}; + + #pragma unroll + for (int i = 0; i < 32; i++) + { + int source_row = q_perm[q_perm_idx++]; + int z_w = (source_row / 32) * 3; + int z_mod = source_row % 32; + int z_bit; + + if (z_mod != 10){ + if (z_mod != 21){ + z_bit = z_mod; + if (z_bit > 21){ + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10){ + z_bit *= 3; + z_bit -= 32; + z_w += 1; + } else { + z_bit *= 3; + } + } else { + z_w += 1; + } + } + + uint64_t src; + if (z_mod == 10) { + src = (w[z_w * w_width + w_column] >> 30) | ((w[(z_w + 1) * w_width + w_column] << 2) & 0x4); + } else if (z_mod == 21){ + src = (w[z_w * w_width + w_column] >> 31) | ((w[(z_w + 1) * w_width + w_column] << 1) & 0x6); + } else { + src = w[z_w * w_width + w_column]; + src >>= z_bit; + src &= 0x07; + } + + z_w = 0; + if (i != 10){ + if (i != 21){ + z_bit = i; + if (z_bit > 21){ + z_bit *= 3; + z_bit -= 64; + z_w += 2; + } else if (z_bit > 10){ + z_bit *= 3; + z_bit -= 32; + z_w += 1; + } else { + z_bit *= 3; + } + } else { + z_w += 1; + } + } + if (i == 10) { + dst[z_w] |= (src & 0x03) << 30; + dst[z_w + 1] |= ((src & 0x4) >> 2); + } else if (i == 21) { + dst[z_w] |= (src & 0x01) << 31; + dst[z_w + 1] |= ((src & 0x6) >> 1); + } else { + dst[z_w] |= (src << z_bit); + } + } + w_new[w_new_row * w_width + w_column] = dst[0]; + w_new[(w_new_row + 1) * w_width + w_column] = dst[1]; + w_new[(w_new_row + 2) * w_width + w_column] = dst[2]; +} + +__global__ void make_sequential_8bit_kernel +( + const uint32_t* __restrict__ w, + uint32_t* __restrict__ w_new, + const int* __restrict__ q_perm, + const int w_width +) +{ + const uint64_t* w2 = (uint64_t*) w; + uint64_t* w_new2 = (uint64_t*) w_new; + int w2_stride = w_width >> 1; + int w2_column = THREADS_X * blockIdx.x + threadIdx.x; + if (w2_column >= w2_stride) return; + int w_new2_row = blockIdx.y; + int q_perm_idx = w_new2_row << 2; + uint64_t dst = 0; + + #pragma unroll + for (int i = 0; i < 4; i++) + { + int source_row = q_perm[q_perm_idx++]; + + int w2_row = source_row >> 2; + int w2_subrow = source_row & 0x03; + int w2_row_shift = w2_subrow << 3; + int wnew2_row_shift = i << 3; + + uint64_t src = w2[w2_row * w2_stride + w2_column]; + src >>= w2_row_shift; + src &= 0x000000ff000000ff; + src <<= wnew2_row_shift; + dst |= src; + } + w_new2[w_new2_row * w2_stride + w2_column] = dst; +} + void shuffle_exllama_weight ( uint32_t* q_weight, int* q_perm, int height, - int width + int width, + int bit ) { if (q_perm) { uint32_t* new_qweight = NULL; - cudaMalloc(&new_qweight, height / 8 * width * sizeof(uint32_t)); + cudaMalloc(&new_qweight, height / 32 * bit * width * sizeof(uint32_t)); dim3 blockDim, gridDim; blockDim.x = THREADS_X; blockDim.y = 1; gridDim.x = DIVIDE(width, THREADS_X); - gridDim.y = height / 8; - + gridDim.y = height / 32 * bit; + + auto kernel = make_sequential_4bit_kernel; + if (bit == 2) { + kernel = make_sequential_2bit_kernel; + } else if (bit == 3) { + kernel = make_sequential_3bit_kernel; + gridDim.y = height / 32; + } else if (bit == 8) { + kernel = make_sequential_8bit_kernel; + } const cudaStream_t stream = 
at::cuda::getCurrentCUDAStream(); - make_sequential_kernel<<>> + kernel<<>> ( q_weight, new_qweight, q_perm, - height / 8, width ); // Replace qweights - cudaMemcpyAsync(q_weight, new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(q_weight, new_qweight, height / 32 * bit * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); // Cleanup cudaDeviceSynchronize(); cudaFree(new_qweight); @@ -818,6 +2006,14 @@ void shuffle_exllama_weight blockDim.y = 1; gridDim.x = DIVIDE(width, THREADS_X); gridDim.y = 1; + auto shuffle_kernel = shuffle_4bit_kernel; + if (bit == 2) { + shuffle_kernel = shuffle_2bit_kernel; + } else if (bit == 3) { + shuffle_kernel = shuffle_3bit_kernel; + } else if (bit == 8) { + shuffle_kernel = shuffle_8bit_kernel; + } const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); shuffle_kernel<<>>(q_weight, height, width); } @@ -832,13 +2028,14 @@ torch::Tensor gptq_gemm torch::Tensor b_gptq_qzeros, torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, - bool use_exllama + bool use_exllama, + int bit ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options); - at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 8, b_q_weight.size(1)}, options); + at::Tensor temp_dq = torch::empty({b_q_weight.size(0) * 32 / bit, b_q_weight.size(1)}, options); vllm::gptq::gemm_half_q_half_cuda ( @@ -854,7 +2051,8 @@ torch::Tensor gptq_gemm c.size(1), // n a.size(1), // k b_gptq_qzeros.size(0), // group number - use_exllama + use_exllama, + bit ); return c; } @@ -862,14 +2060,16 @@ torch::Tensor gptq_gemm void gptq_shuffle ( torch::Tensor q_weight, - torch::Tensor q_perm + torch::Tensor q_perm, + int bit ) { const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight)); vllm::gptq::shuffle_exllama_weight( (uint32_t*) q_weight.data_ptr(), q_perm.device().is_meta() ? 
NULL : (int*) q_perm.data_ptr(), - q_weight.size(0) * 8, - q_weight.size(1) + q_weight.size(0) * 32 / bit, + q_weight.size(1), + bit ); } diff --git a/csrc/quantization/gptq/qdq_2.cuh b/csrc/quantization/gptq/qdq_2.cuh new file mode 100644 index 0000000000000..295872a91de37 --- /dev/null +++ b/csrc/quantization/gptq/qdq_2.cuh @@ -0,0 +1,87 @@ +/* +Copied from https://github.com/turboderp/exllamav2 +*/ + +#ifndef _qdq_2_cuh +#define _qdq_2_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { + +// Permutation: +// +// ffddbb99 77553311 eeccaa88 66442200 + +__forceinline__ __device__ void shuffle_2bit_16 +( + uint32_t* q, + int stride +) +{ + uint32_t qa = q[0]; + uint32_t qb = 0; + + #pragma unroll + for (int i = 0; i < 8; i++) + { + uint32_t qa0 = qa & 0x03; + uint32_t qa1 = (qa & 0x0c) >> 2; + qa >>= 4; + qb |= (qa1 << (i * 2 + 16)); + qb |= (qa0 << (i * 2)); + } + q[0] = qb; +} + +__forceinline__ __device__ void dequant_2bit_16 +( + const uint32_t q_0, + half2 (&dq)[8], + int stride, + const uint32_t zero +) +{ + const uint32_t c0 = 0x64006400; + const half y4_ = __float2half_rn(1.0f / 4.0f); + const half y16_ = __float2half_rn(1.0f / 16.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y4 = __halves2half2(y4_, y4_); + const half2 y16 = __halves2half2(y16_, y16_); + const half2 y64 = __halves2half2(y64_, y64_); + + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z4_ = __hsub(__int2half_rn(-256), __int2half_rn(zero)); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z4 = __half2half2(z4_); + const half2 z16 = __half2half2(z16_); + const half2 z64 = __half2half2(z64_); + + uint32_t qa = q_0; + half2_uint32 q0((qa & 0x00030003) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x000c000c) | c0); // half2(q[ 2], q[ 3]) * 4 + 1024 + half2_uint32 q2((qa & 0x00300030) | c0); // half2(q[ 4], q[ 5]) * 16 + 1024 + half2_uint32 q3((qa & 0x00c000c0) | c0); // half2(q[ 6], q[ 7]) * 64 + 1024 + qa >>= 8; + half2_uint32 q4((qa & 0x00030003) | c0); // half2(q[ 8], q[ 8]) + 1024 + half2_uint32 q5((qa & 0x000c000c) | c0); // half2(q[10], q[11]) * 4 + 1024 + half2_uint32 q6((qa & 0x00300030) | c0); // half2(q[12], q[13]) * 16 + 1024 + half2_uint32 q7((qa & 0x00c000c0) | c0); // half2(q[14], q[15]) * 64 + 1024 + + dq[0] = __hadd2(q0.as_half2, z1); + dq[1] = __hfma2(q1.as_half2, y4, z4); + dq[2] = __hfma2(q2.as_half2, y16, z16); + dq[3] = __hfma2(q3.as_half2, y64, z64); + dq[4] = __hadd2(q4.as_half2, z1); + dq[5] = __hfma2(q5.as_half2, y4, z4); + dq[6] = __hfma2(q6.as_half2, y16, z16); + dq[7] = __hfma2(q7.as_half2, y64, z64); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/csrc/quantization/gptq/qdq_3.cuh b/csrc/quantization/gptq/qdq_3.cuh new file mode 100644 index 0000000000000..3e7ecde752ba3 --- /dev/null +++ b/csrc/quantization/gptq/qdq_3.cuh @@ -0,0 +1,141 @@ +#ifndef _qdq_3_cuh +#define _qdq_3_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { +// Permutation: +// +// v9997775 55333111 u8886664 44222000 (u, v lsb) +// vjjjhhhf ffdddbbb uiiiggge eecccaaa +// vtttrrrp ppnnnlll usssqqqo oommmkkk + +__forceinline__ __device__ void shuffle_3bit_32 +( + uint32_t* q, + int stride +) +{ + uint32_t qa = q[0 * stride]; + uint32_t qb = q[1 * stride]; + uint32_t qc = q[2 * stride]; + + // qa: aa999888 77766655 54443332 22111000 + // qb: lkkkjjji 
iihhhggg fffeeedd dcccbbba + // qc: vvvuuutt tsssrrrq qqpppooo nnnmmmll + + uint32_t qd = qc >> 26; + qc <<= 4; + qc |= qb >> 28; + qb <<= 2; + qb |= qa >> 30; + + // qa: ..999888 77766655 54443332 22111000 + // qb: ..jjjiii hhhgggff feeedddc ccbbbaaa + // qc: ..tttsss rrrqqqpp pooonnnm mmlllkkk + // qd: vvvuuu + + uint32_t za = 0; + uint32_t zb = 0; + uint32_t zc = 0; + + for (int i = 0; i < 5; i++) { uint32_t t0 = qa & 0x07; uint32_t t1 = (qa & 0x38) >> 3; qa >>= 6; za |= (t0 << (i * 3)); za |= (t1 << (i * 3 + 16)); } + for (int i = 0; i < 5; i++) { uint32_t t0 = qb & 0x07; uint32_t t1 = (qb & 0x38) >> 3; qb >>= 6; zb |= (t0 << (i * 3)); zb |= (t1 << (i * 3 + 16)); } + for (int i = 0; i < 5; i++) { uint32_t t0 = qc & 0x07; uint32_t t1 = (qc & 0x38) >> 3; qc >>= 6; zc |= (t0 << (i * 3)); zc |= (t1 << (i * 3 + 16)); } + + // za: 9997775 55333111 8886664 44222000 + // zb: jjjhhhf ffdddbbb iiiggge eecccaaa + // zc: tttrrrp ppnnnlll sssqqqo oommmkkk + // qd: vvvuuu + + za |= ((qd & 0x01) >> 0) << 15; + zb |= ((qd & 0x02) >> 1) << 15; + zc |= ((qd & 0x04) >> 2) << 15; + za |= ((qd & 0x08) >> 3) << 31; + zb |= ((qd & 0x10) >> 4) << 31; + zc |= ((qd & 0x20) >> 5) << 31; + + // za: v9997775 55333111 u8886664 44222000 (u, v lsb) + // zb: vjjjhhhf ffdddbbb uiiiggge eecccaaa + // zc: vtttrrrp ppnnnlll usssqqqo oommmkkk + + q[0 * stride] = za; + q[1 * stride] = zb; + q[2 * stride] = zc; +} + +__forceinline__ __device__ void dequant_3bit_32 +( + const uint32_t q_0, + const uint32_t q_1, + const uint32_t q_2, + half2 (&dq)[16], + int stride, + const uint32_t zero +) +{ + const uint32_t c0 = 0x64006400; + const half y8_ = __float2half_rn(1.0f / 8.0f); + const half y64_ = __float2half_rn(1.0f / 64.0f); + const half2 y8 = __halves2half2(y8_, y8_); + const half2 y64 = __halves2half2(y64_, y64_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z8_ = __hsub(__int2half_rn(-128), __int2half_rn(zero)); + const half z64_ = __hsub(__int2half_rn(-16), __int2half_rn(zero)); + const half2 z1 = __halves2half2(z1_.as_half, z1_.as_half); + const half2 z8 = __halves2half2(z8_, z8_); + const half2 z64 = __halves2half2(z64_, z64_); + + uint32_t qa = q_0; + uint32_t qb = q_1; + uint32_t qc = q_2; + + half2_uint32 q0((qa & 0x00070007) | c0); // half2(q[ 0], q[ 1]) + 1024 + half2_uint32 q1((qa & 0x00380038) | c0); // half2(q[ 2], q[ 3]) * 8 + 1024 + qa >>= 6; + half2_uint32 q2((qa & 0x00070007) | c0); // half2(q[ 4], q[ 5]) + 1024 + half2_uint32 q3((qa & 0x00380038) | c0); // half2(q[ 6], q[ 7]) * 8 + 1024 + half2_uint32 q4((qa & 0x01c001c0) | c0); // half2(q[ 8], q[ 9]) * 64 + 1024 + qa >>= 9; + qa &= 0x00010001; + half2_uint32 q5((qb & 0x00070007) | c0); // half2(q[10], q[11]) + 1024 + half2_uint32 q6((qb & 0x00380038) | c0); // half2(q[12], q[13]) * 8 + 1024 + qb >>= 6; + half2_uint32 q7((qb & 0x00070007) | c0); // half2(q[14], q[15]) + 1024 + half2_uint32 q8((qb & 0x00380038) | c0); // half2(q[16], q[17]) * 8 + 1024 + half2_uint32 q9((qb & 0x01c001c0) | c0); // half2(q[18], q[19]) * 64 + 1024 + qb >>= 8; + qb &= 0x00020002; + half2_uint32 q10((qc & 0x00070007) | c0); // half2(q[20], q[21]) + 1024 + half2_uint32 q11((qc & 0x00380038) | c0); // half2(q[22], q[23]) * 8 + 1024 + qc >>= 6; + half2_uint32 q12((qc & 0x00070007) | c0); // half2(q[24], q[25]) + 1024 + half2_uint32 q13((qc & 0x00380038) | c0); // half2(q[26], q[27]) * 8 + 1024 + half2_uint32 q14((qc & 0x01c001c0) | c0); // half2(q[28], q[29]) * 64 + 1024 + qc >>= 7; + qc &= 0x00040004; + half2_uint32 q15((qa | qb | qc) | c0); + + dq[ 
0] = __hadd2( q0.as_half2, z1); + dq[ 1] = __hfma2( q1.as_half2, y8, z8); + dq[ 2] = __hadd2( q2.as_half2, z1); + dq[ 3] = __hfma2( q3.as_half2, y8, z8); + dq[ 4] = __hfma2( q4.as_half2, y64, z64); + dq[ 5] = __hadd2( q5.as_half2, z1); + dq[ 6] = __hfma2( q6.as_half2, y8, z8); + dq[ 7] = __hadd2( q7.as_half2, z1); + dq[ 8] = __hfma2( q8.as_half2, y8, z8); + dq[ 9] = __hfma2( q9.as_half2, y64, z64); + dq[10] = __hadd2(q10.as_half2, z1); + dq[11] = __hfma2(q11.as_half2, y8, z8); + dq[12] = __hadd2(q12.as_half2, z1); + dq[13] = __hfma2(q13.as_half2, y8, z8); + dq[14] = __hfma2(q14.as_half2, y64, z64); + dq[15] = __hadd2(q15.as_half2, z1); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/csrc/quantization/gptq/qdq_4.cuh b/csrc/quantization/gptq/qdq_4.cuh index cfc4635a22c1d..881f353f6564d 100644 --- a/csrc/quantization/gptq/qdq_4.cuh +++ b/csrc/quantization/gptq/qdq_4.cuh @@ -38,16 +38,17 @@ __forceinline__ __device__ void dequant_4bit_8 ( const uint32_t q_0, half2 (&dq)[4], - int stride + int stride, + const uint32_t zero ) { const uint32_t c0 = 0x64006400; const half y16_ = __float2half_rn(1.0f / 16.0f); const half2 y16 = __halves2half2(y16_, y16_); - const half z1_ = __float2half_rn(-1024.0f - 8.0f); - const half z16_ = __float2half_rn(-1024.0f / 16.0f - 8.0f); - const half2 z1 = __halves2half2(z1_, z1_); - const half2 z16 = __halves2half2(z16_, z16_); + const half_uint16 z1_(0xe400 | zero); // half(-1024.0f - zero); + const half z16_ = __hsub(__int2half_rn(-64), __int2half_rn(zero)); + const half2 z1 = __half2half2(z1_.as_half); + const half2 z16 = __half2half2(z16_); uint32_t qa = q_0; half2_uint32 q0((qa & 0x000f000f) | c0); // half2(q[ 0], q[ 1]) + 1024 @@ -143,93 +144,4 @@ __forceinline__ __device__ void dequant_4bit_8_gptq } // namespace gptq } // namespace vllm -#else - -namespace vllm { -namespace gptq { -__forceinline__ __device__ void shuffle_4bit_8 -( - uint32_t* q, - int stride -) -{ -} - -__forceinline__ __device__ void dequant_4bit_8 -( - const uint32_t q_0, - half2 (&dq)[4], - int stride -) -{ - half dqh[8]; - for (int i = 0; i < 8; i++) dqh[i] = dq_ns(exb(q_0, i * 4, 0x0f), 8); - - for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero_scale -( - const uint32_t zero, - const half scale, - half2 (&z1)[2], - half2 (&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z = __hmul(z, scale); - z1[0] = __half2half2(z); - y1[0] = __half2half2(scale); -} - -__forceinline__ __device__ void dequant_4bit_8_prep_zero -( - const uint32_t zero, - half2(&z1)[2], - half2(&y1)[2] -) -{ - half z = __int2half_rn(-((int)zero)); - z1[0] = __half2half2(z); -} - -__forceinline__ __device__ void dequant_4bit_8_gptq -( - const uint32_t q_0, - half2 (&dq)[4], - half2 (&z1)[2], - half2 (&y1)[2], - int stride, - bool scaled -) -{ - half2 dqh2[8]; - - uint32_t qa = q_0; - for (int i = 0; i < 4; i++) - { - half d0 = __int2half_rn(qa & 0x0f); qa >>= 4; - half d1 = __int2half_rn(qa & 0x0f); qa >>= 4; - dqh2[i] = __halves2half2(d0, d1); - } - - if (scaled) - { - dq[0] = __hfma2(dqh2[0], y1[0], z1[0]); - dq[1] = __hfma2(dqh2[1], y1[0], z1[0]); - dq[2] = __hfma2(dqh2[2], y1[0], z1[0]); - dq[3] = __hfma2(dqh2[3], y1[0], z1[0]); - } - else - { - dq[0] = __hadd2(dqh2[0], z1[0]); - dq[1] = __hadd2(dqh2[1], z1[0]); - dq[2] = __hadd2(dqh2[2], z1[0]); - dq[3] = __hadd2(dqh2[3], z1[0]); - } -} - -} // namespace gptq -} // namespace vllm - #endif diff --git a/csrc/quantization/gptq/qdq_8.cuh 
b/csrc/quantization/gptq/qdq_8.cuh new file mode 100644 index 0000000000000..0c7ad7876140b --- /dev/null +++ b/csrc/quantization/gptq/qdq_8.cuh @@ -0,0 +1,40 @@ +/* +Copied from https://github.com/turboderp/exllamav2 +*/ + +#ifndef _qdq_8_cuh +#define _qdq_8_cuh + +#include "qdq_util.cuh" + +namespace vllm { +namespace gptq { + +__forceinline__ __device__ void shuffle_8bit_4 +( + uint32_t* q, + int stride +) +{ +} + +__forceinline__ __device__ void dequant_8bit_8 +( + const uint32_t q_0, + const uint32_t q_1, + half2 (&dq)[4], + int stride, + const uint32_t zero +) +{ + half dqh[8]; + for (int i = 0; i < 4; i++) dqh[i ] = dq_ns(exb(q_0, i * 8, 0xff), zero); + for (int i = 0; i < 4; i++) dqh[i + 4] = dq_ns(exb(q_1, i * 8, 0xff), zero); + + for (int i = 0; i < 4; i++) dq[i] = __halves2half2(dqh[i * 2], dqh[i * 2 + 1]); +} + +} // namespace gptq +} // namespace vllm + +#endif diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 7218760fbe55d..2e6aabb232673 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,6 +1,7 @@ import enum from enum import Enum from typing import Any, Dict, List, Optional +from fractions import Fraction import torch from torch.nn.parameter import Parameter @@ -27,11 +28,10 @@ def __init__( self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act - self.pack_factor = 32 // self.weight_bits - # exllama kernel v1 only supports 4 bit - if self.weight_bits != 4: + self.pack_factor = Fraction(32, self.weight_bits) + if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 4-bit weight quantization is supported for " + "Currently, only 2/3/4/8-bit weight quantization is supported for " f"GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: @@ -101,7 +101,7 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor != 0: + if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -201,11 +201,13 @@ def apply_weights(self, else: weights["g_idx"] = torch.empty((1, 1), device="meta") weights["exllama_state"] = ExllamaState.READY - ops.gptq_shuffle(weights["qweight"], weights["g_idx"]) + ops.gptq_shuffle(weights["qweight"], weights["g_idx"], + self.quant_config.weight_bits) output = ops.gptq_gemm(reshaped_x, weights["qweight"], weights["qzeros"], weights["scales"], weights["g_idx"], - weights["exllama_state"] == ExllamaState.READY) + weights["exllama_state"] == ExllamaState.READY, + self.quant_config.weight_bits) if bias is not None: output = output + bias return output.reshape(out_shape) From a6d471c75939b2f4708a4e1cb1aa3b7b993ee54b Mon Sep 17 00:00:00 2001 From: Jae-Won Chung Date: Thu, 29 Feb 2024 01:04:07 -0500 Subject: [PATCH 030/113] Fix: `AttributeError` in OpenAI-compatible server (#3018) --- vllm/entrypoints/openai/protocol.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e85e7e2b1ede9..97cfd797587c4 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -57,7 +57,7 @@ class UsageInfo(BaseModel): class ChatCompletionRequest(BaseModel): model: str - messages: Union[str, List[Dict[str, str]]] + messages: List[Dict[str, str]] temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 n: Optional[int] = 1 diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5635ac6c9e106..e5ae39e110a40 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -80,7 +80,7 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: return self.response_role else: - return request.messages[-1].role + return request.messages[-1]["role"] async def chat_completion_stream_generator( self, request: ChatCompletionRequest, From 9289e577ec185bd9feb2c03bb86b82f1bf9bb633 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Thu, 29 Feb 2024 14:15:18 +0800 Subject: [PATCH 031/113] add cache_config's info to prometheus metrics. 
(#3100) --- vllm/config.py | 4 ++++ vllm/engine/llm_engine.py | 1 + vllm/engine/metrics.py | 10 +++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index fc848b72d7f2a..2f8883fe0733e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -308,6 +308,10 @@ def __init__( self.num_gpu_blocks = None self.num_cpu_blocks = None + def metrics_info(self): + # convert cache_config to dict(key: str, value:str) for prometheus metrics info + return {key: str(value) for key, value in self.__dict__.items()} + def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: raise ValueError( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0fd7efdef813..6f5af71426d78 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -138,6 +138,7 @@ def __init__( self.stat_logger = StatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC, labels=dict(model_name=model_config.model)) + self.stat_logger.info("cache_config", self.cache_config) self.forward_dag = None if USE_RAY_COMPILED_DAG: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 83e66a9372272..54b09c38f58a5 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,5 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics +from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics import time import numpy as np @@ -23,6 +23,10 @@ def __init__(self, labelnames: List[str]): if hasattr(collector, "_name") and "vllm" in collector._name: REGISTRY.unregister(collector) + self.info_cache_config = Info( + name='vllm:cache_config', + documentation='information of cache_config') + # System stats self.gauge_scheduler_running = Gauge( name="vllm:num_requests_running", @@ -128,6 +132,10 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None: self.labels = labels self.metrics = Metrics(labelnames=list(labels.keys())) + def info(self, type: str, obj: object) -> None: + if type == "cache_config": + self.metrics.info_cache_config.info(obj.metrics_info()) + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: return float(np.sum(tracked_stats) / (now - self.last_local_log)) From bfdcfa6a053c693800551bd1bd71acabbe1941e8 Mon Sep 17 00:00:00 2001 From: Seonghyeon Date: Thu, 29 Feb 2024 17:51:48 +0900 Subject: [PATCH 032/113] Support starcoder2 architecture (#3089) --- README.md | 1 + tests/models/test_models.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/starcoder2.py | 310 ++++++++++++++++++ vllm/transformers_utils/config.py | 10 + vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/starcoder2.py | 127 +++++++ 7 files changed, 452 insertions(+) create mode 100644 vllm/model_executor/models/starcoder2.py create mode 100644 vllm/transformers_utils/configs/starcoder2.py diff --git a/README.md b/README.md index f771788db2b89..064faa550f267 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.) - StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.) +- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.) - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) 
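Since the list above now advertises Starcoder2 checkpoints, a short offline-inference sketch may help readers try the new architecture; the model name and sampling values below are only examples, and the patch bundles its own `Starcoder2Config` fallback so an older `transformers` release still works:

```python
from vllm import LLM, SamplingParams

# Any of the checkpoints listed above should work, e.g. the 3B variant.
llm = LLM(model="bigcode/starcoder2-3b")

prompts = ["def fibonacci(n):"]
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```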
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e44452e9893cf..fb567e837d281 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -19,6 +19,7 @@ "microsoft/phi-2", "stabilityai/stablelm-3b-4e1t", "allenai/OLMo-1B", + "bigcode/starcoder2-3b", ] diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index e4f3a785cd99a..75c2ae1e9f48e 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -45,6 +45,7 @@ "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), + "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), } # Models not supported by ROCm. diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py new file mode 100644 index 0000000000000..1eda07b724cae --- /dev/null +++ b/vllm/model_executor/models/starcoder2.py @@ -0,0 +1,310 @@ +# coding=utf-8 +# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Starcoder2 model.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearMethodBase, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) +from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +try: + from transformers import Starcoder2Config +except ImportError: + # fallback to PretrainedConfig + # NOTE: Please install transformers from source or use transformers>=4.39.0 + from transformers import PretrainedConfig as Starcoder2Config + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class Starcoder2Attention(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = self.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = config.rope_theta + self.max_position_embeddings = config.max_position_embeddings + self.use_bias = config.use_bias + self.sliding_window = config.sliding_window + + self.qkv_proj = QKVParallelLinear( + self.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=self.use_bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + self.hidden_size, + bias=self.use_bias, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=int(self.rope_theta), + is_neox_style=True, + ) + self.attn = PagedAttention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Starcoder2MLP(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.c_fc = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.c_proj = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + bias=config.use_bias, + linear_method=linear_method, + ) + self.act = get_act_fn(config.hidden_act, + intermediate_size=config.intermediate_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.c_proj(hidden_states) + return hidden_states + + +class Starcoder2DecoderLayer(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = Starcoder2Attention(config, + linear_method=linear_method) + self.mlp = Starcoder2MLP(config, linear_method=linear_method) + self.input_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.norm_epsilon) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + 
hidden_states + + return hidden_states + + +class Starcoder2Model(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # TODO: consider padding_idx (currently removed) + self.embed_tokens = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.layers = nn.ModuleList([ + Starcoder2DecoderLayer(config, linear_method=linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states = layer(positions, hidden_states, kv_caches[i], + input_metadata) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class Starcoder2ForCausalLM(nn.Module): + + def __init__(self, + config: Starcoder2Config, + linear_method: Optional[LinearMethodBase] = None): + super().__init__() + self.config = config + self.model = Starcoder2Model(config, linear_method=linear_method) + self.vocab_size = config.vocab_size + self.unpadded_vocab_size = config.vocab_size + if config.tie_word_embeddings: + self.lm_head_weight = self.model.embed_tokens.weight + else: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + ) + self.lm_head_weight = self.lm_head.weight + self.sampler = Sampler(self.unpadded_vocab_size, config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head_weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b0413f440a0e..5e1f0439aec51 100644 --- 
a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -9,6 +9,7 @@ "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) + "starcoder2": Starcoder2Config, } @@ -16,6 +17,15 @@ def get_config(model: str, trust_remote_code: bool, revision: Optional[str] = None, code_revision: Optional[str] = None) -> PretrainedConfig: + # FIXME(woosuk): This is a temporary fix for StarCoder2. + # Remove this when the model is supported by HuggingFace transformers. + if "bigcode" in model and "starcoder2" in model: + config_class = _CONFIG_REGISTRY["starcoder2"] + config = config_class.from_pretrained(model, + revision=revision, + code_revision=code_revision) + return config + try: config = AutoConfig.from_pretrained( model, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ef955f75cedaa..4966526f15184 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,9 +4,11 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.starcoder2 import Starcoder2Config __all__ = [ "ChatGLMConfig", "MPTConfig", "RWConfig", + "Starcoder2Config", ] diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py new file mode 100644 index 0000000000000..4c3b6b8def074 --- /dev/null +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -0,0 +1,127 @@ +from transformers import PretrainedConfig + + +class Starcoder2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a + Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. + + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49152): + Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Starcoder2Model`] + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 12288): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 30): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 24): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 2): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + norm_epsilon (`float`, *optional*, defaults to 1e-05): + Epsilon value for the layer norm + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + bos_token_id (`int`, *optional*, defaults to 50256): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 50256): + The id of the "end-of-sequence" token. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None` (no sliding window). + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + residual_dropout (`float`, *optional*, defaults to 0.0): + Residual connection dropout value. + embedding_dropout (`float`, *optional*, defaults to 0.0): + Embedding dropout. + use_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias term on linear layers of the model. 
+ + + ```python + >>> from transformers import Starcoder2Model, Starcoder2Config + + >>> # Initializing a Starcoder2 7B style configuration + >>> configuration = Starcoder2Config() + + >>> # Initializing a model from the Starcoder2 7B style configuration + >>> model = Starcoder2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "starcoder2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=49152, + hidden_size=3072, + intermediate_size=12288, + num_hidden_layers=30, + num_attention_heads=24, + num_key_value_heads=2, + hidden_act="gelu_pytorch_tanh", + max_position_embeddings=4096, + initializer_range=0.018042, + norm_epsilon=1e-5, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + rope_theta=10000.0, + sliding_window=None, + attention_dropout=0.0, + residual_dropout=0.0, + embedding_dropout=0.0, + use_bias=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.use_bias = use_bias + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_epsilon = norm_epsilon + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.residual_dropout = residual_dropout + self.embedding_dropout = embedding_dropout + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + if self.architectures is None: + self.architectures = ['Starcoder2ForCausalLM'] From 2c08ff23c07f2f8d51da8e1783c5346dccc1fd12 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Fri, 1 Mar 2024 03:13:58 +0800 Subject: [PATCH 033/113] Fix building from source on WSL (#3112) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16978d74e0425..1f48be948aa84 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ def _is_neuron() -> bool: torch_neuronx_installed = True try: subprocess.run(["neuron-ls"], capture_output=True, check=True) - except FileNotFoundError: + except (FileNotFoundError, PermissionError): torch_neuronx_installed = False return torch_neuronx_installed From 29a8d6a554a87292f05b62078976b43a899691e3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 29 Feb 2024 11:20:42 -0800 Subject: [PATCH 034/113] [Fix] Don't deep-copy LogitsProcessors when copying SamplingParams (#3099) --- vllm/engine/llm_engine.py | 5 +++-- vllm/sampling_params.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6f5af71426d78..9bf19b932d35b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -484,8 +484,9 @@ def add_request( prompt_token_ids[:prefix_pos], lora_request.lora_int_id if lora_request else 0) if prefix_pos is not None else None - # Defensive copy of SamplingParams, which are used by the sampler - sampling_params = copy.deepcopy(sampling_params) + # Defensive copy of SamplingParams, which are used by the sampler, + # this doesn't deep-copy LogitsProcessor objects + sampling_params = sampling_params.clone() # Create the sequence group. 
seq_group = SequenceGroup(request_id, [seq], sampling_params, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 51d39220ca9ca..8103f3c2b24bf 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,4 +1,5 @@ """Sampling parameters for text generation.""" +import copy from enum import IntEnum from functools import cached_property from typing import Callable, List, Optional, Union @@ -237,6 +238,20 @@ def sampling_type(self) -> SamplingType: return SamplingType.RANDOM_SEED return SamplingType.RANDOM + def clone(self) -> "SamplingParams": + """Deep copy excluding LogitsProcessor objects. + + LogitsProcessor objects are excluded because they may contain an + arbitrary, nontrivial amount of data. + See https://github.com/vllm-project/vllm/issues/3087 + """ + + logit_processor_refs = None if self.logits_processors is None else { + id(lp): lp + for lp in self.logits_processors + } + return copy.deepcopy(self, memo=logit_processor_refs) + def __repr__(self) -> str: return ( f"SamplingParams(n={self.n}, " From 703e42ee4b3efed3c71e7ae7d15f0f96e05722d4 Mon Sep 17 00:00:00 2001 From: felixzhu555 <79335195+felixzhu555@users.noreply.github.com> Date: Thu, 29 Feb 2024 14:13:08 -0800 Subject: [PATCH 035/113] Add guided decoding for OpenAI API server (#2819) Co-authored-by: br3no Co-authored-by: simon-mo --- requirements.txt | 1 + tests/entrypoints/test_guided_processors.py | 75 ++++++ tests/entrypoints/test_openai_server.py | 237 ++++++++++++++++++ vllm/engine/async_llm_engine.py | 3 + vllm/entrypoints/openai/protocol.py | 36 ++- vllm/entrypoints/openai/serving_chat.py | 9 + vllm/entrypoints/openai/serving_completion.py | 9 + vllm/model_executor/guided_decoding.py | 99 ++++++++ .../guided_logits_processors.py | 129 ++++++++++ 9 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 tests/entrypoints/test_guided_processors.py create mode 100644 vllm/model_executor/guided_decoding.py create mode 100644 vllm/model_executor/guided_logits_processors.py diff --git a/requirements.txt b/requirements.txt index d4599ec95d945..05ec2e804e13b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 +outlines >= 0.0.27 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py new file mode 100644 index 0000000000000..5b39269916f8b --- /dev/null +++ b/tests/entrypoints/test_guided_processors.py @@ -0,0 +1,75 @@ +# This unit test should be moved to a new +# tests/test_guided_decoding directory. 
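The `SamplingParams.clone()` helper introduced a few hunks above leans on `copy.deepcopy`'s memo argument: pre-seeding the memo with `id(obj): obj` makes `deepcopy` return the original object wherever it appears instead of copying it. A self-contained sketch of that idiom, where `Big` merely stands in for an expensive `LogitsProcessor`:

```python
import copy


class Big:
    """Stand-in for a LogitsProcessor holding a large amount of state."""


shared = Big()
params = {"temperature": 0.7, "logits_processors": [shared]}

# Seed the memo so deepcopy treats `shared` as already copied.
clone = copy.deepcopy(params, {id(shared): shared})

assert clone is not params                              # the container is copied
assert clone["logits_processors"] is not params["logits_processors"]
assert clone["logits_processors"][0] is shared          # the processor is shared
```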
+ +from transformers import AutoTokenizer +import torch + +from vllm.model_executor.guided_logits_processors import (RegexLogitsProcessor, + JSONLogitsProcessor) + +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + + +def test_guided_logits_processors(): + """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor.""" + tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') + regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer) + json_LP = JSONLogitsProcessor(TEST_SCHEMA, tokenizer) + + regex_LP.init_state() + token_ids = tokenizer.encode( + f"Give an example IPv4 address with this regex: {TEST_REGEX}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + regex_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) + + json_LP.init_state() + token_ids = tokenizer.encode( + f"Give an employee profile that fits this schema: {TEST_SCHEMA}") + tensor = torch.rand(32000) + original_tensor = torch.clone(tensor) + json_LP(token_ids, tensor) + assert tensor.shape == original_tensor.shape + assert not torch.allclose(tensor, original_tensor) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 72e2374899793..e426cf7eed72b 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -9,12 +9,64 @@ import openai # use the official client for correctness check from huggingface_hub import snapshot_download # downloading lora to test lora requests +# imports for guided decoding tests +import json +import jsonschema +import re + from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +TEST_SCHEMA = { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "string" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work history"] +} + +TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" + +TEST_CHOICE = [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", + "Swift", "Kotlin" +] + pytestmark = pytest.mark.asyncio @@ -325,6 +377,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): max_tokens=max_tokens, temperature=0.0, 
logit_bias={str(token_id): 100}, + seed=42, ) assert completion.choices[0].text is not None and len( completion.choices[0].text) >= 5 @@ -358,5 +411,189 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): assert first_response != completion.choices[0].text +async def test_guided_json_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt= + f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + n=3, + temperature=1.0, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 3 + for i in range(3): + assert completion.choices[i].text is not None + output_json = json.loads(completion.choices[i].text) + jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) + + +async def test_guided_json_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "Give an example JSON for an employee profile that " + \ + f"fits this schema: {TEST_SCHEMA}" + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + message = chat_completion.choices[0].message + assert message.content is not None + json1 = json.loads(message.content) + jsonschema.validate(instance=json1, schema=TEST_SCHEMA) + + messages.append({"role": "assistant", "content": message.content}) + messages.append({ + "role": + "user", + "content": + "Give me another one with a different name and age" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=500, + extra_body=dict(guided_json=TEST_SCHEMA)) + message = chat_completion.choices[0].message + assert message.content is not None + json2 = json.loads(message.content) + jsonschema.validate(instance=json2, schema=TEST_SCHEMA) + assert json1["name"] != json2["name"] + assert json1["age"] != json2["age"] + + +async def test_guided_regex_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}", + n=3, + temperature=1.0, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 3 + for i in range(3): + assert completion.choices[i].text is not None + assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None + + +async def test_guided_regex_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + f"Give an example IP address with this regex: {TEST_REGEX}" + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + ip1 = chat_completion.choices[0].message.content + assert ip1 is not None + assert re.fullmatch(TEST_REGEX, ip1) is not None + + messages.append({"role": "assistant", "content": ip1}) + messages.append({"role": "user", "content": "Give me a different one"}) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=20, + extra_body=dict(guided_regex=TEST_REGEX)) + ip2 = 
chat_completion.choices[0].message.content + assert ip2 is not None + assert re.fullmatch(TEST_REGEX, ip2) is not None + assert ip1 != ip2 + + +async def test_guided_choice_completion(server, client: openai.AsyncOpenAI): + completion = await client.completions.create( + model=MODEL_NAME, + prompt="The best language for type-safe systems programming is ", + n=2, + temperature=1.0, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 2 + for i in range(2): + assert completion.choices[i].text in TEST_CHOICE + + +async def test_guided_choice_chat(server, client: openai.AsyncOpenAI): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + choice1 = chat_completion.choices[0].message.content + assert choice1 in TEST_CHOICE + + messages.append({"role": "assistant", "content": choice1}) + messages.append({ + "role": "user", + "content": "I disagree, pick another one" + }) + chat_completion = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=10, + extra_body=dict(guided_choice=TEST_CHOICE)) + choice2 = chat_completion.choices[0].message.content + assert choice2 in TEST_CHOICE + assert choice1 != choice2 + + +async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI): + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example JSON that fits this schema: 42", + extra_body=dict(guided_json=42)) + + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": + "user", + "content": + "The best language for type-safe systems programming is " + }] + with pytest.raises(openai.BadRequestError): + _ = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + extra_body=dict(guided_regex={ + 1: "Python", + 2: "C++" + })) + + with pytest.raises(openai.BadRequestError): + _ = await client.completions.create( + model=MODEL_NAME, + prompt="Give an example string that fits this regex", + extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA)) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7cba654602779..daa6419cdad3b 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -333,6 +333,9 @@ def is_running(self) -> bool: return (self.background_loop is not None and not self.background_loop.done()) + def get_tokenizer(self): + return self.engine.tokenizer.tokenizer + def start_background_loop(self) -> None: """Start the background loop.""" if self.is_running: diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 97cfd797587c4..26499b8d7a66f 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,7 +3,7 @@ import time from typing import Dict, List, Literal, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from vllm.utils import random_uuid from vllm.sampling_params import SamplingParams @@ -86,6 +86,9 @@ class ChatCompletionRequest(BaseModel): min_p: 
Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + guided_json: Optional[Union[str, dict, BaseModel]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None def to_sampling_params(self) -> SamplingParams: if self.logprobs and not self.top_logprobs: @@ -131,6 +134,20 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode="before") + @classmethod + def check_guided_decoding_count(cls, data): + guide_count = sum([ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding " + "('guided_json', 'guided_regex' or 'guided_choice').") + return data + class CompletionRequest(BaseModel): model: str @@ -163,6 +180,9 @@ class CompletionRequest(BaseModel): min_p: Optional[float] = 0.0 include_stop_str_in_output: Optional[bool] = False length_penalty: Optional[float] = 1.0 + guided_json: Optional[Union[str, dict, BaseModel]] = None + guided_regex: Optional[str] = None + guided_choice: Optional[List[str]] = None def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 @@ -207,6 +227,20 @@ def logit_bias_logits_processor( logits_processors=logits_processors, ) + @model_validator(mode="before") + @classmethod + def check_guided_decoding_count(cls, data): + guide_count = sum([ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None + ]) + if guide_count > 1: + raise ValueError( + "You can only use one kind of guided decoding " + "('guided_json', 'guided_regex' or 'guided_choice').") + return data + class LogProbs(BaseModel): text_offset: List[int] = Field(default_factory=list) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index e5ae39e110a40..f4ad0aa5a0184 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,6 +12,7 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor logger = init_logger(__name__) @@ -62,6 +63,14 @@ async def create_chat_completion( prompt=prompt) sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) + guided_decode_logits_processor = ( + await get_guided_decoding_logits_processor( + request, self.engine.get_tokenizer())) + if guided_decode_logits_processor: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append( + guided_decode_logits_processor) except ValueError as e: return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 610f53549da48..713e67793b290 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -16,6 +16,7 @@ ) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA +from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor logger = init_logger(__name__) @@ 
-286,6 +287,14 @@ async def create_completion(self, request: CompletionRequest, try: sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) + guided_decode_logit_processor = ( + await get_guided_decoding_logits_processor( + request, self.engine.get_tokenizer())) + if guided_decode_logit_processor is not None: + if sampling_params.logits_processors is None: + sampling_params.logits_processors = [] + sampling_params.logits_processors.append( + guided_decode_logit_processor) prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py new file mode 100644 index 0000000000000..a8573f8bdc6c8 --- /dev/null +++ b/vllm/model_executor/guided_decoding.py @@ -0,0 +1,99 @@ +import asyncio +import concurrent.futures +from copy import copy +from enum import Enum +from functools import lru_cache +from json import dumps as json_dumps +from re import escape as regex_escape +from typing import Union, Tuple +from pydantic import BaseModel + +from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest +from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor + + +class GuidedDecodingMode(Enum): + JSON = "json" + REGEX = "regex" + CHOICE = "choice" + + +global_thread_pool = None # used for generating logits processor fsm + + +async def get_guided_decoding_logits_processor( + request: Union[CompletionRequest, ChatCompletionRequest], + tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]: + """ + Given an OpenAI-compatible request, check for guided decoding parameters + and get the necessary logits processor for the given guide. + We cache logit processors by (guide, tokenizer), and on cache hit + we make a shallow copy to reuse the same underlying FSM. 
+ """ + global global_thread_pool + guide, mode = _get_guide_and_mode(request) + if not guide: + return None + + if global_thread_pool is None: + global_thread_pool = concurrent.futures.ThreadPoolExecutor( + max_workers=2) + loop = asyncio.get_running_loop() + + result = await loop.run_in_executor(global_thread_pool, + _get_cached_logits_processor, guide, + tokenizer, mode) + + logits_processor = copy(result) + # reset logits processor's internal state + logits_processor.init_state() + return logits_processor + + +def _get_guide_and_mode( + request: Union[CompletionRequest, ChatCompletionRequest] +) -> Tuple[str, GuidedDecodingMode]: + + if request.guided_json: + if not isinstance(request.guided_json, (str, dict, BaseModel)): + raise TypeError("JSON schema must be str, dict, or BaseModel") + + json = request.guided_json + if isinstance(json, dict): + # turn dict into hashable string + json = json_dumps(json, sort_keys=True) + elif isinstance(json, BaseModel): + # use pydantic signature so that different model classes + # with the same fields will get hashed the same + json = str(json.__signature__) + return json, GuidedDecodingMode.JSON + + elif request.guided_regex: + if not isinstance(request.guided_regex, str): + raise TypeError("Regex must be string") + return request.guided_regex, GuidedDecodingMode.REGEX + + elif request.guided_choice: + if not isinstance(request.guided_choice, list): + raise TypeError("Choices must be a list") + + # choice just uses regex + choices = [ + regex_escape(str(choice)) for choice in request.guided_choice + ] + choices_regex = "(" + "|".join(choices) + ")" + return choices_regex, GuidedDecodingMode.CHOICE + + else: + return None, None + + +@lru_cache(maxsize=32) +def _get_cached_logits_processor(guide: str, tokenizer, + mode: GuidedDecodingMode): + if mode == GuidedDecodingMode.JSON: + return JSONLogitsProcessor(guide, tokenizer) + elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE: + return RegexLogitsProcessor(guide, tokenizer) + else: + raise ValueError(f"Unknown guided decoding mode {mode}") diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py new file mode 100644 index 0000000000000..1b3e5e71a5911 --- /dev/null +++ b/vllm/model_executor/guided_logits_processors.py @@ -0,0 +1,129 @@ +# Copyright 2024- the Outlines developers +# This file is adapted from +# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import math +from collections import defaultdict +from typing import Union, DefaultDict, Dict, List, Optional + +import torch +from pydantic import BaseModel +from outlines.fsm.fsm import RegexFSM +from outlines.fsm.json_schema import build_regex_from_schema + + +class RegexLogitsProcessor: + + def __init__(self, regex_string: str, tokenizer): + """Compile the FSM that drives the regex-structured generation. 
+ + Parameters + ---------- + regex_string + A string that represents a regular expression + tokenizer + The model's tokenizer + + """ + tokenizer = self.adapt_tokenizer(tokenizer) + fsm = RegexFSM(regex_string, tokenizer) + self.fsm = fsm + + def init_state(self): + """Initialize the FSM states.""" + self.fsm_state: DefaultDict[int, int] = defaultdict(int) + + def __call__(self, input_ids: List[int], + scores: torch.Tensor) -> torch.Tensor: + """Use the FSM to bias the logits before sampling the next token.""" + + seq_id = hash(tuple(input_ids)) + + if len(input_ids) == 0: + self.init_state() + else: + last_token = input_ids[-1] + last_seq_id = hash(tuple(input_ids[:-1])) + self.fsm_state[seq_id] = self.fsm.next_state( + self.fsm_state[last_seq_id], last_token) + + allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id]) + + mask = torch.full((scores.shape[-1], ), + -math.inf, + device=scores.device) + mask[allowed_tokens] = 0 + scores.add_(mask) + + return scores + + def adapt_tokenizer(self, tokenizer): + """Adapt vLLM's tokenizer to use to compile the FSM. + + The API of Outlines tokenizers is slightly different to that of + `transformers`. In addition we need to handle the missing spaces to + Llama's tokenizer to be able to compile FSMs for this model. + + """ + tokenizer.vocabulary = tokenizer.get_vocab() + tokenizer.special_tokens = set(tokenizer.all_special_tokens) + + def convert_token_to_string(token: str) -> str: + from transformers.file_utils import SPIECE_UNDERLINE + + string = tokenizer.convert_tokens_to_string([token]) + + # A hack to handle missing spaces to HF's Llama tokenizers + if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>": + return " " + string + + return string + + tokenizer.convert_token_to_string = convert_token_to_string + + return tokenizer + + +class JSONLogitsProcessor(RegexLogitsProcessor): + + def __init__(self, + schema: Union[str, Dict, BaseModel], + tokenizer, + whitespace_pattern: Optional[str] = None): + """Compile the FSM that drives the JSON-guided generation. + + Parameters + ---------- + schema + A JSON schema that encodes the structure we want the model to generate + tokenizer + The model's tokenizer + whitespace_pattern + Pattern to use for JSON syntactic whitespace (doesn't impact string literals) + Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + """ + if isinstance(schema, type(BaseModel)): + schema_str = json.dumps(schema.model_json_schema()) + elif isinstance(schema, Dict): + schema_str = json.dumps(schema) + elif isinstance(schema, str): + schema_str = schema + else: + raise ValueError( + f"Cannot parse schema {schema}. 
The schema must be either " + + "a Pydantic object, a dictionary or a string that contains the JSON " + + "Schema specification") + regex_string = build_regex_from_schema(schema_str, whitespace_pattern) + super().__init__(regex_string, tokenizer) From 54d3544784ff20e7038abf72793eaf734e727269 Mon Sep 17 00:00:00 2001 From: Sherry <503147114@qq.com> Date: Fri, 1 Mar 2024 15:52:22 +0800 Subject: [PATCH 036/113] Fix: Output text is always truncated in some models (#3016) --- vllm/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9bf19b932d35b..df4858a696530 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -980,7 +980,10 @@ def _check_stop(self, seq: Sequence, def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None: - if not sampling_params.include_stop_str_in_output and stop_string: + if sampling_params.include_stop_str_in_output: + return + + if stop_string and seq.output_text.endswith(stop_string): # Truncate the output text so that the stop string is # not included in the output. seq.output_text = seq.output_text[:-len(stop_string)] From 27ca23dc002e06eade014ac6b801dc2dcbea40f3 Mon Sep 17 00:00:00 2001 From: Seonghyeon Date: Sat, 2 Mar 2024 02:59:06 +0900 Subject: [PATCH 037/113] Remove exclude_unset in streaming response (#3143) --- vllm/entrypoints/openai/serving_completion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 713e67793b290..86b753fa06ab5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -96,7 +96,7 @@ async def completion_stream_generator( logprobs=logprobs, finish_reason=finish_reason, ) - ]).model_dump_json(exclude_unset=True) + ]).model_dump_json() yield f"data: {response_json}\n\n" if output.finish_reason is not None: # return final usage @@ -121,7 +121,7 @@ async def completion_stream_generator( ) ], usage=final_usage, - ).model_dump_json(exclude_unset=True) + ).model_dump_json() yield f"data: {response_json}\n\n" yield "data: [DONE]\n\n" @@ -306,7 +306,7 @@ async def create_completion(self, request: CompletionRequest, request, prompt=prompt) generators.append( - self.engine.generate(None, + self.engine.generate(prompt, sampling_params, f"{request_id}-{i}", prompt_token_ids=input_ids, From 49d849b3ab7aa6ae493ccde1d85d226833f73fbb Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 1 Mar 2024 14:04:14 -0500 Subject: [PATCH 038/113] docs: Add tutorial on deploying vLLM model with KServe (#2586) Signed-off-by: Yuan Tang --- docs/source/index.rst | 1 + docs/source/serving/deploying_with_kserve.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/serving/deploying_with_kserve.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 32929257661ad..bdc541cb2d58e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -70,6 +70,7 @@ Documentation serving/distributed_serving serving/run_on_sky + serving/deploying_with_kserve serving/deploying_with_triton serving/deploying_with_docker serving/serving_with_langchain diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst new file mode 100644 index 0000000000000..7f22766e09aef --- /dev/null +++ b/docs/source/serving/deploying_with_kserve.rst @@ -0,0 +1,8 @@ +.. 
_deploying_with_kserve: + +Deploying with KServe +============================ + +vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. + +Please see `this guide `_ for more details on using vLLM with KServe. From 90fbf12540da089fcc7dc825ce2ceb7ea3a3df33 Mon Sep 17 00:00:00 2001 From: Huarong Date: Sat, 2 Mar 2024 03:42:06 +0800 Subject: [PATCH 039/113] fix relative import path of protocol.py (#3134) Co-authored-by: huohuarong --- vllm/entrypoints/openai/serving_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 86b753fa06ab5..99a10196b5f73 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine -from .protocol import ( +from vllm.entrypoints.openai.protocol import ( CompletionRequest, CompletionResponse, CompletionResponseChoice, From c0c2335ce027486d254c31f665ce00d7db427d22 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 1 Mar 2024 14:47:51 -0600 Subject: [PATCH 040/113] Integrate Marlin Kernels for Int4 GPTQ inference (#2497) Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com> Co-authored-by: alexm --- csrc/ops.h | 9 + csrc/pybind.cpp | 4 +- csrc/quantization/marlin/LICENSE | 209 +++ .../quantization/marlin/marlin_cuda_kernel.cu | 1145 +++++++++++++++++ requirements-dev.txt | 1 + setup.py | 2 + tests/conftest.py | 32 + tests/models/test_marlin.py | 97 ++ vllm/config.py | 18 +- vllm/model_executor/layers/linear.py | 29 + .../layers/quantization/__init__.py | 2 + .../layers/quantization/marlin.py | 210 +++ 12 files changed, 1752 insertions(+), 6 deletions(-) create mode 100644 csrc/quantization/marlin/LICENSE create mode 100644 csrc/quantization/marlin/marlin_cuda_kernel.cu create mode 100644 tests/models/test_marlin.py create mode 100644 vllm/model_executor/layers/quantization/marlin.py diff --git a/csrc/ops.h b/csrc/ops.h index 08dfb0e8604f1..249c7451bf73c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -84,6 +84,15 @@ torch::Tensor awq_dequantize( int split_k_iters, int thx, int thy); + +torch::Tensor marlin_gemm( + torch::Tensor& a, + torch::Tensor& b_q_weight, + torch::Tensor& b_scales, + torch::Tensor& workspace, + int64_t size_m, + int64_t size_n, + int64_t size_k); #endif void squeezellm_gemm( diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 5d062bb5700bc..4b6ade7566398 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -52,11 +52,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - // Quantization ops +// Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); + ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ"); ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ"); #endif + ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ"); ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ"); ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); diff --git a/csrc/quantization/marlin/LICENSE b/csrc/quantization/marlin/LICENSE new file mode 100644 index 0000000000000..1d1e4cf9c8233 --- /dev/null +++ 
b/csrc/quantization/marlin/LICENSE @@ -0,0 +1,209 @@ +Contains code from https://github.com/IST-DASLab/marlin + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +------------------------------------------------------------------------------------ + +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses/ +for text of these licenses. diff --git a/csrc/quantization/marlin/marlin_cuda_kernel.cu b/csrc/quantization/marlin/marlin_cuda_kernel.cu new file mode 100644 index 0000000000000..cf1b0afdec8b4 --- /dev/null +++ b/csrc/quantization/marlin/marlin_cuda_kernel.cu @@ -0,0 +1,1145 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +template inline std::string str(T x) { return std::to_string(x); } + +namespace marlin { + +constexpr int ceildiv(int a, int b) { return (a + b - 1) / b; } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + +// Instances of `Vec` are used to organize groups of >>registers<<, as needed +// for instance as inputs to tensor core operations. Consequently, all +// corresponding index accesses must be compile-time constants, which is why we +// extensively use `#pragma unroll` throughout the kernel code to guarantee +// this. +template struct Vec { + T elems[n]; + __device__ T &operator[](int i) { return elems[i]; } +}; + +using I4 = Vec; + +// Matrix fragments for tensor core instructions; their precise layout is +// documented here: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type +using FragA = Vec; +using FragB = Vec; +using FragC = Vec; +using FragS = Vec; // quantization scales + +// Predicated asynchronous global->shared copy; used for inputs A where we apply +// predication to handle batchsizes that are not multiples of 16. +__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, + bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), "l"(glob_ptr), "n"(BYTES)); +} + +// Asynchronous global->shared copy with a cache hint indicating that the values +// may be evicted immediately; used for quantized weights B, which are only +// accessed precisely once and should thus not pollute the L2 cache which we +// need for inputs A and outputs C. 
+__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .b64 p;\n" + " createpolicy.fractional.L2::evict_first.b64 p, 1.0;" + " cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), "n"(BYTES)); +} + +// Async copy fence. +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +// Wait until at most `n` async copy stages are still pending. +template __device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +__device__ inline void mma(const FragA &a_frag, const FragB &frag_b, + FragC &frag_c) { + const uint32_t *a = reinterpret_cast(&a_frag); + const uint32_t *b = reinterpret_cast(&frag_b); + float *c = reinterpret_cast(&frag_c); + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), + "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3])); +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. +__device__ inline void ldsm4(FragA &frag_a, const void *smem_ptr) { + uint32_t *a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); +} + +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template __device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" + : "=r"(res) + : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Efficiently dequantize an int32 value into a full B-fragment of 4 fp16 +// values. We mostly follow the strategy in the link below, with some small +// changes: +// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h +__device__ inline FragB dequant(int q) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + FragB frag_b; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), + *reinterpret_cast(&SUB)); + frag_b[1] = __hfma2(*reinterpret_cast(&hi), + *reinterpret_cast(&MUL), + *reinterpret_cast(&ADD)); + return frag_b; +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. 
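A rough Python reference for what `dequant()` followed by `scale()` computes, ignoring Marlin's interleaved fragment layout and weight repacking (this sketch illustrates the arithmetic only, not the storage order): each 32-bit word packs eight 4-bit weights with a symmetric zero point of 8, and every weight in a group shares one fp16 scale.

```python
import numpy as np


def dequant_reference(packed_word: int, group_scale: float) -> np.ndarray:
    """(q - 8) * s for the eight 4-bit values in one 32-bit word."""
    nibbles = np.array([(packed_word >> (4 * i)) & 0xF for i in range(8)],
                       dtype=np.float32)
    return (nibbles - 8.0) * group_scale


# Nibbles from low to high: 0xF, 0xE, ..., 0x8 -> 7, 6, ..., 0 after the
# zero-point shift, then scaled by 0.5.
print(dequant_reference(0x89ABCDEF, group_scale=0.5))
# [3.5, 3.0, 2.5, 2.0, 1.5, 1.0, 0.5, 0.0]
```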
+__device__ inline void scale(FragB &frag_b, FragS &frag_s, int i) { + half2 s = __half2half2(reinterpret_cast<__half *>(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int *lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" + : "=r"(state) + : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. +__device__ inline void barrier_release(int *lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" + : + : "l"(lock), "r"(val)); + } +} + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks with + // a separate quantization scale + > +__global__ void +Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk + const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn + int4 *__restrict__ C, // fp16 output buffer of shape mxn + const int4 + *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int *locks // extra global storage for barrier synchronization +) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > 16 * thread_m_blocks) { + parallel = prob_m / (16 * thread_m_blocks); + prob_m = 16 * thread_m_blocks; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = ceildiv(k_tiles * n_tiles * parallel, gridDim.x); + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts in + // the middle of group. 
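The striped partitioning described in the comment block above can be reproduced in a few lines of Python. The sketch below is illustrative only (it assumes `parallel == 1` and omits the group-size rounding of `iters` that the kernel applies immediately after this point); it recovers the 3x3-tile, 5-SM example from the comment:

```python
def ceildiv(a, b):
    return (a + b - 1) // b


def stripe_assignment(k_tiles, n_tiles, num_blocks):
    """Map each 16x16 tile (k index, n slice) to the block that owns it."""
    iters = ceildiv(k_tiles * n_tiles, num_blocks)
    owner = [[None] * n_tiles for _ in range(k_tiles)]
    for block in range(num_blocks):
        first = iters * block                       # first tile of this stripe
        last = min(first + iters, k_tiles * n_tiles)
        for t in range(first, last):                # tiles advance down a column slice
            owner[t % k_tiles][t // k_tiles] = block
    return owner


# Matches the 3x3-tile, 5-SM example above:
#   0 1 3
#   0 2 3
#   1 2 4
print(stripe_assignment(3, 3, 5))
```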
+ if (group_blocks != -1) + iters = (group_blocks / thread_k_blocks) * + ceildiv(iters, (group_blocks / thread_k_blocks)); + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = + 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_k / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + locks += (slice_col_par / n_tiles) * n_tiles; + slice_col = slice_col_par % n_tiles; + } + + // Compute all information about the current slice which is required for + // synchronization. + auto init_slice = [&]() { + slice_iters = + iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) + slice_iters = 0; + if (slice_iters == 0) + return; + if (slice_row + slice_iters > k_tiles) + slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * ceildiv(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = ceildiv(k_tiles - col_off, iters); + if (col_off > 0) + slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) + slice_idx--; + } + } + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * prob_k / 8; + C += 16 * thread_m_blocks * prob_n / 8; + locks += n_tiles; + slice_col = 0; + } + }; + init_slice(); + + int a_gl_stride = prob_k / 8; // stride of the A matrix in global memory + // We typically use `constexpr` to indicate that this value is a compile-time + // constant + constexpr int a_sh_stride = + 16 * thread_k_blocks / 8; // stride of an A matrix tile in shared memory + constexpr int a_gl_rd_delta_o = + 16 * thread_k_blocks / + 8; // delta between subsequent A tiles in global memory + int a_gl_rd_delta_i = + a_gl_stride * + (threads / a_gl_rd_delta_o); // between subsequent accesses within a tile + constexpr int a_sh_wr_delta = + a_sh_stride * (threads / a_gl_rd_delta_o); // between shared memory writes + constexpr int a_sh_rd_delta_o = + 2 * ((threads / 32) / + (thread_n_blocks / 4)); // between shared memory tile reads + constexpr int a_sh_rd_delta_i = + a_sh_stride * 16; // within a shared memory tile + constexpr int a_sh_stage = + a_sh_stride * (16 * thread_m_blocks); // overall size of a tile + constexpr int a_sh_wr_iters = + ceildiv(a_sh_stage, + a_sh_wr_delta); // number of shared write iterations for a tile + + int b_gl_stride = 16 * prob_n / 32; + constexpr int b_sh_stride = 32 * thread_n_blocks / 4; + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride); + constexpr int b_sh_wr_delta = threads; + constexpr int b_sh_rd_delta = threads; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 
8; + constexpr int s_sh_stage = s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = + a_sh_stride * ((threadIdx.x % 32) % 16) + (threadIdx.x % 32) / 16; + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = + b_gl_stride * (threadIdx.x / b_sh_stride) + (threadIdx.x % b_sh_stride); + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + int b_sh_wr = threadIdx.x; + int b_sh_rd = threadIdx.x; + + int s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + + s_sh_stride * slice_col + threadIdx.x; + int s_sh_wr = threadIdx.x; + int s_sh_rd; + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + if (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) % 4; + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ row; + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. + int a_sh_wr_trans[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) + a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { +#pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = + transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4 *B_ptr[b_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. 
+ int4 *sh_a = sh; + int4 *sh_b = sh_a + (stages * a_sh_stage); + int4 *sh_s = sh_b + (stages * b_sh_stage); + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; + + // Zero accumulators. + auto zero_accums = [&]() { +#pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) + reinterpret_cast(frag_c)[i] = 0; + }; + + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. + auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4 *sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred( + &sh_a_stage[a_sh_wr_trans[i]], + &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4 *sh_b_stage = sh_b + b_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { + cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); + B_ptr[i] += b_gl_rd_delta_o; + } + // Only fetch scales if this tile starts a new group + if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { + int4 *sh_s_stage = sh_s + s_sh_stage * pipe; + if (s_sh_wr_pred) + cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); + s_gl_rd += s_gl_rd_delta; + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. + auto fetch_to_registers = [&](int k, int pipe) { + // It may seem inefficient that we reload the groups for every sub-tile; + // however, this does not seem to be a significant bottleneck, while some + // theoretically better attempts have lead to bad instruction ordering by + // the compiler and correspondingly a noticeable drop in performance. + if (group_blocks != -1) { + int4 *sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } + int4 *sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm4(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4 *sh_b_stage = sh_b + b_sh_stage * pipe; + frag_b_quant[k % 2] = *reinterpret_cast( + &sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd]); + }; + + // Execute the actual tensor core matmul of a sub-tile. + auto matmul = [&](int k) { +// We have the m dimension as the inner loop in order to encourage overlapping +// dequantization and matmul operations. +#pragma unroll + for (int j = 0; j < 4; j++) { + int b_quant = frag_b_quant[k % 2][j]; + int b_quant_shift = b_quant >> 8; + FragB frag_b0 = dequant(b_quant); + // If there are no groups, we can just scale the final output once and can + // avoid doing so for each weight. 
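On the per-column path (`group_blocks == -1`) there is exactly one scale per output column, so it factors out of every dot product and can be applied once when the output is written back (as the scale application in `write_result` further down does), instead of to each dequantized weight. A quick fp32 check of that identity, purely illustrative:

```python
import torch

A = torch.randn(16, 64)                             # fp16 activations in the kernel; fp32 here
B = torch.randint(0, 16, (64, 32)).float() - 8.0    # symmetric "int4" weights
s = torch.rand(32)                                  # one scale per output column

scale_each_weight = A @ (B * s)     # scaling every weight, as the grouped path must
scale_output_once = (A @ B) * s     # scaling the output once, deferred to the epilogue
assert torch.allclose(scale_each_weight, scale_output_once, atol=1e-4)
```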
+ if (group_blocks != -1) + scale(frag_b0, frag_s[k % 2][j], 0); + FragB frag_b1 = dequant(b_quant_shift); + if (group_blocks != -1) + scale(frag_b1, frag_s[k % 2][j], 1); +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + mma(frag_a[k % 2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k % 2][i], frag_b1, frag_c[i][j][1]); + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride / 2; + if (red_off >= 1) { + int red_idx = threadIdx.x / b_sh_stride; + constexpr int red_sh_stride = b_sh_stride * 4 * 2; + constexpr int red_sh_delta = b_sh_stride; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride) + + (threadIdx.x % b_sh_stride); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + +#pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { +#pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { +#pragma unroll + for (int j = 0; j < 4 * 2; j++) { + int red_sh_wr = + red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float *c_rd = reinterpret_cast( + &sh[red_sh_delta * j + red_sh_rd]); + float *c_wr = reinterpret_cast(&sh[red_sh_wr]); +#pragma unroll + for (int k = 0; k < 4; k++) + reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += + c_rd[k] + c_wr[k]; + } + sh[red_sh_wr] = + reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { +#pragma unroll + for (int i = 0; i < 4 * 2; i++) { + float *c_rd = + reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); +#pragma unroll + for (int j = 0; j < 4; j++) + reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += + c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped partitioning + // minimizes the number of such reductions and our outputs are usually rather + // small, we perform this reduction serially in L2 cache. + auto global_reduce = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + constexpr int c_sh_wr_delta = active_threads; + int c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { +// Interestingly, doing direct global accesses here really seems to mess up the +// compiler and lead to slowdowns, hence we also use async-copies even though +// these fetches are not actually asynchronous. 
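
One detail worth spelling out about the async-copy loop below: partial results live in fp16 in `C` but are accumulated in fp32 registers, so every non-final slice does an fp16 read, an fp32 add, and an fp16 write-back. A tiny numerical model of that round trip (illustrative only):

import numpy as np

def accumulate_partial(c_fp16, partial_fp32):
    # Read the fp16 partial from C, add in fp32, write back in fp16; the same
    # precision pattern each non-final slice uses here.
    return (c_fp16.astype(np.float32) + partial_fp32).astype(np.float16)

c = np.zeros(4, dtype=np.float16)
for part in (np.full(4, 0.1, dtype=np.float32) for _ in range(3)):
    c = accumulate_partial(c, part)
print(c)   # ~0.3 per element, with fp16 rounding at every write-back
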
+#pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + cp_async4_pred(&sh[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || + 8 * (i / 2) + row < prob_m); + } + cp_async_fence(); + cp_async_wait<0>(); + } + +#pragma unroll + for (int i = 0; i < thread_m_blocks * 4; i++) { + if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { + if (!first) { + int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)] += + __half2float(reinterpret_cast<__half *>(&c_red)[j]); + } + } + if (!last) { + int4 c; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + reinterpret_cast<__half *>(&c)[j] = + __float2half(reinterpret_cast( + &frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4)]); + } + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = + c; + } + } + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. + auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = + c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr = + (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS &s) { + half2 res = __halves2half2(__float2half(c0), __float2half(c1)); + if (group_blocks == + -1) // for per-column quantization we finally apply the scale here + res = __hmul2(res, s[0]); + ((half2 *)sh)[idx] = res; + }; + if (threadIdx.x / 32 < thread_n_blocks / 4) { +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { +#pragma unroll + for (int j = 0; j < 4; j++) { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], + frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], + frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], + frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], + frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + +#pragma unroll + for (int i = 0; + i < ceildiv(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); + i++) { + if (c_gl_wr < c_gl_wr_end) { + C[c_gl_wr] = sh[c_sh_rd]; + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + }; + + // Start global fetch and register load pipelines. 
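
`start_pipes`, defined next, primes the multi-stage pipeline: `stages - 1` tile fetches are issued up front, and from then on each iteration waits for the oldest fetch, issues a new one into the freed slot, and runs the matmul on the tile that just arrived. A rough Python schematic of that steady state (illustrative only, not the kernel's exact issue order):

STAGES = 4  # pipeline depth, matching the kernel's `stages`

def run_pipeline(num_tiles: int) -> None:
    in_flight = list(range(min(STAGES - 1, num_tiles)))  # start_pipes(): prefetch stages-1 tiles
    for i in range(num_tiles):
        tile = in_flight.pop(0)        # wait_for_stage(): oldest fetch has landed
        assert tile == i
        nxt = i + STAGES - 1
        if nxt < num_tiles:
            in_flight.append(nxt)      # fetch_to_shared() for the slot just freed
        # matmul(tile) would run here, overlapped with the fetches still in flight

run_pipeline(10)
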
+ auto start_pipes = [&]() { +#pragma unroll + for (int i = 0; i < stages - 1; i++) + fetch_to_shared(i, i, i < slice_iters); + zero_accums(); + wait_for_stage(); + fetch_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + }; + start_pipes(); + + // Main loop. + while (slice_iters) { +// We unroll over both the global fetch and the register load pipeline to ensure +// all shared memory accesses are static. Note that both pipelines have even +// length meaning that the next iteration will always start at index 0. +#pragma unroll + for (int pipe = 0; pipe < stages;) { +#pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, + slice_iters >= stages); + pipe++; + wait_for_stage(); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) + break; + } + a_gl_rd += a_gl_rd_delta_o * stages; + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. + if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if (group_blocks == -1 && last) { + if (s_sh_wr_pred) + cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]); + cp_async_fence(); + } + thread_block_reduce(); + if (group_blocks == -1 && last) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } + if (slice_count > 1) { // only globally reduce if there is more than one + // block in a slice + barrier_acquire(&locks[slice_col], slice_idx); + global_reduce(slice_idx == 0, last); + barrier_release(&locks[slice_col], last); + } + if (last) // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + init_slice(); + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + + (threadIdx.x % a_gl_rd_delta_o); +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) + B_ptr[i] -= b_gl_stride; + } + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + start_pipes(); + } + } + } +} + +#else + +template shared + // fetch pipeline + const int group_blocks = -1 // number of consecutive 16x16 blocks with + // a separate quantization scale + > +__global__ void +Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk + const int4 *__restrict__ B, // 4bit quantized weight matrix of shape kxn + int4 *__restrict__ C, // fp16 output buffer of shape mxn + const int4 + *__restrict__ s, // fp16 quantization scales of shape (k/groupsize)xn + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int *locks // extra global storage for barrier synchronization +) { + // Marlin is not implemented yet for SM < 8.0 + assert(false); + return; +} + +#endif + +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. 
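
The arithmetic behind the comment above, spelled out for the 256-thread default (the 4-scheduler figure is an Ampere-class assumption):

threads_per_block = 256                     # default for most configurations below
warps_per_block = threads_per_block // 32   # 8 warps
warp_schedulers_per_sm = 4                  # Ampere-class assumption
assert warps_per_block / warp_schedulers_per_sm == 2  # >1 warp per scheduler for latency hiding
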
+const int USER_THREADS = + 256; // Note: This is only used with user-provided thread_k/n +const int STAGES = 4; // 4 pipeline stages fit into shared memory +const int SHARED_MEM = + 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; + +static constexpr int tile_size = 16; +static constexpr int max_par = 16; + +static constexpr int pack_factor_4bit = + 8; // We have 8 4-bit vals inside a 32 bit + +#define __CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ + GROUP_BLOCKS, NUM_THREADS) \ + else if (thread_m_blocks == THREAD_M_BLOCKS && \ + thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ + cudaFuncSetAttribute(Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + SHARED_MEM); \ + Marlin<<>>( \ + A_ptr, B_ptr, C_ptr, s_ptr, prob_m, prob_n, prob_k, locks); \ + } + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, // Default + {128, 64, 128}, // Reduce N 2X, same K + {64, 256, 256}, // Reduce K 2X, increase N 2X + {64, 128, 128}, // Reduce K 2X, same N +}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, // Default + {128, 128, 256}, // Reduce N 2X, increase K 2X + {64, 128, 128}, // Reduce N 2X, same K + {128, 64, 128}, // Reduce N 4X, increase K 2X +}; + +bool is_valid_config(thread_config_t const &th_config, int prob_m, int prob_n, + int prob_k) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || + th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // thread_k can be only 128 or 64 (because it must be less than groupsize + // which is 128) + if (th_config.thread_k != 128 && th_config.thread_k != 64) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + return true; +} + +thread_config_t determine_thread_config(int prob_m, int prob_n, int prob_k) { + + if (prob_m <= 16) { + for (auto th_config : small_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + + } else { + for (auto th_config : large_batch_thread_configs) { + if (is_valid_config(th_config, prob_m, prob_n, prob_k)) { + return th_config; + } + } + } + + return thread_config_t{-1, -1, -1}; +} + +#define CALL_IF(N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(1, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(2, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(3, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, -1, NUM_THREADS) \ + __CALL_IF(4, N_BLOCKS, K_BLOCKS, 8, NUM_THREADS) + +void marlin_cuda(const void *A, const void *B, void *C, void 
*s, int prob_m, + int prob_n, int prob_k, void *workspace, int groupsize = -1, + int dev = 0, cudaStream_t stream = 0, int thread_k = -1, + int thread_n = -1, int sms = -1, int max_par = 16) { + int tot_m = prob_m; + int tot_m_blocks = ceildiv(tot_m, 16); + int pad = 16 * tot_m_blocks - tot_m; + + if (sms == -1) + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev); + + // Set thread config + thread_config_t th_config; + if (thread_k != -1 && thread_n != -1) { + // User-defined config + th_config = thread_config_t{thread_k, thread_n, USER_THREADS}; + } else { + // Auto config + th_config = determine_thread_config(prob_m, prob_n, prob_k); + } + + if (!is_valid_config(th_config, prob_m, prob_n, prob_k)) { + throw std::runtime_error( + "Invalid thread config: thread_k = " + str(th_config.thread_k) + + ", thread_n = " + str(th_config.thread_n) + + ", num_threads = " + str(th_config.num_threads) + " for MKN = [" + + str(prob_m) + ", " + str(prob_k) + ", " + str(prob_n) + "]"); + } + + // Uncomment for debug + // std::cout << "Using thread_config: thread_k = " + str(th_config.thread_k) + + // ", thread_n = " + str(th_config.thread_n) + + // ", num_threads = " + str(th_config.num_threads) + " for + // MKN = [" + str(prob_m) + + // ", " + str(prob_k) + ", " + str(prob_n) + "]\n"; + + int num_threads = th_config.num_threads; + thread_k = th_config.thread_k; + thread_n = th_config.thread_n; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + int group_blocks = (groupsize == -1) ? -1 : groupsize / 16; + int blocks = sms; + + if (prob_m == 0 || prob_n == 0 || prob_k == 0) { + return; + } + + TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n, + " is not divisible by thread_n = ", thread_n); + TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k, + " is not divisible by thread_k = ", thread_k); + if (group_blocks != -1) { + TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k, + " is not divisible by group_blocks = ", group_blocks); + } + + const int4 *A_ptr = (const int4 *)A; + const int4 *B_ptr = (const int4 *)B; + int4 *C_ptr = (int4 *)C; + const int4 *s_ptr = (const int4 *)s; + + int *locks = (int *)workspace; + + for (int i = 0; i < tot_m_blocks; i += 4) { + int thread_m_blocks = tot_m_blocks - i; + prob_m = tot_m - 16 * i; + int par = 1; + if (thread_m_blocks > 4) { + // Note that parallel > 1 currently only works for inputs without any + // padding + par = (16 * thread_m_blocks - pad) / 64; + if (par > max_par) + par = max_par; + prob_m = 64 * par; + i += 4 * (par - 1); + thread_m_blocks = 4; + } + + // For compilation speed, we only define the kernel configurations that have + // seemed useful (in terms of performance) in our testing, however many more + // are, in principle, possible. 
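
The `if (false) {}` plus `CALL_IF` chain that follows expands, via `__CALL_IF`, into an `else if` ladder over a fixed set of template instantiations; anything outside that set falls through to the `std::runtime_error`. For reference, the compiled set can be written out explicitly (derived from the four `CALL_IF` lines; the repeated entries in the macro simply collapse in the set):

SUPPORTED_CONFIGS = {
    (thread_m_blocks, n_blocks, k_blocks, group_blocks, threads)
    for (n_blocks, k_blocks, threads) in [(8, 8, 256), (16, 4, 256),
                                          (8, 4, 128), (4, 8, 128)]
    for thread_m_blocks in (1, 2, 3, 4)
    for group_blocks in (-1, 8)          # channelwise, or groupsize 128 (128 / 16)
}

def is_compiled(m_blocks, n_blocks, k_blocks, group_blocks, threads) -> bool:
    return (m_blocks, n_blocks, k_blocks, group_blocks, threads) in SUPPORTED_CONFIGS

assert len(SUPPORTED_CONFIGS) == 4 * 4 * 2   # 32 compiled kernel variants
assert is_compiled(4, 16, 4, 8, 256)
assert not is_compiled(1, 8, 8, -1, 128)     # thread shape exists, but not with 128 threads
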
+ if (false) { + } + CALL_IF(8, 8, 256) + CALL_IF(16, 4, 256) + CALL_IF(8, 4, 128) + CALL_IF(4, 8, 128) + else { + throw std::runtime_error("Unsupported shapes: MKN = [" + str(prob_m) + + ", " + str(prob_k) + ", " + str(prob_n) + "]" + + ", groupsize = " + str(groupsize) + + ", thread_m_blocks = " + str(thread_m_blocks) + + ", thread_n_blocks = " + str(thread_n_blocks) + + ", thread_k_blocks = " + str(thread_k_blocks)); + } + + A_ptr += 16 * thread_m_blocks * (prob_k / 8) * par; + C_ptr += 16 * thread_m_blocks * (prob_n / 8) * par; + } +} + +} // namespace marlin + +torch::Tensor marlin_gemm(torch::Tensor &a, torch::Tensor &b_q_weight, + torch::Tensor &b_scales, torch::Tensor &workspace, + int64_t size_m, int64_t size_n, int64_t size_k) { + + // Verify M + TORCH_CHECK(size_m == a.size(0), + "Shape mismatch: a.size(0) = " + str(a.size(0)) + + ", size_m = " + str(size_m)); + + // Verify K + TORCH_CHECK(size_k == a.size(1), + "Shape mismatch: a.size(1) = " + str(a.size(1)) + + ", size_k = " + str(size_k)); + TORCH_CHECK(size_k % marlin::tile_size == 0, + "size_k = " + str(size_k) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0), + "Shape mismatch: b_q_weight.size(0) = " + + str(b_q_weight.size(0)) + ", size_k = " + str(size_k) + + ", tile_size = " + str(marlin::tile_size)); + + // Verify N + TORCH_CHECK(b_scales.size(1) == size_n, + "b_scales.size(1) = " + str(b_scales.size(1)) + + ", size_n = " + str(size_n)); + TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0, + "b_q_weight.size(1) = " + str(b_q_weight.size(1)) + + " is not divisible by tile_size = " + str(marlin::tile_size)); + + int actual_size_n = + (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit; + TORCH_CHECK(size_n == actual_size_n, + "size_n = " + str(size_n) + + ", actual_size_n = " + str(actual_size_n)); + + // Verify A device and strides + TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); + TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + + // Verify B device and strides + TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); + TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous"); + + // Verify scales device and strides + TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); + TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + + // Alloc C matrix + const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); + auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device()); + torch::Tensor c = torch::empty({size_m, size_n}, options); + + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_k = -1; + // thread_n: `n` size of a thread_tile in `weights` (can usually be left as + // auto -1) + int thread_n = -1; + // sms: number of SMs to use for the kernel (can usually be left as auto -1) + int sms = -1; + + // Detect groupsize + if (b_scales.size(0) != 1) { + TORCH_CHECK(size_k % b_scales.size(0) == 0, + "size_k = " + str(size_k) + + ", is not divisible by b_scales.size(0) = " + + str(b_scales.size(0))); + } + int groupsize = b_scales.size(0) == 1 ? 
-1 : size_k / b_scales.size(0); + + // Verify groupsize + TORCH_CHECK(groupsize == -1 || groupsize == 128, + "Unexpected groupsize = " + str(groupsize)); + + // Verify workspace size + TORCH_CHECK( + size_n % marlin::min_thread_n == 0, + "size_n = " + str(size_n) + + ", is not divisible by min_thread_n = " + str(marlin::min_thread_n)); + int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par; + TORCH_CHECK(workspace.numel() >= min_workspace_size, + "workspace.numel = " + str(workspace.numel()) + + " is below min_workspace_size = " + str(min_workspace_size)); + + int dev = a.get_device(); + marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), + b_scales.data_ptr(), size_m, size_n, size_k, + workspace.data_ptr(), groupsize, dev, + at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, + sms, marlin::max_par); + + return c; +} diff --git a/requirements-dev.txt b/requirements-dev.txt index 80d66530f47f0..55e102374fd73 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ types-setuptools pytest pytest-forked pytest-asyncio +pytest-rerunfailures httpx einops # required for MPT openai diff --git a/setup.py b/setup.py index 1f48be948aa84..745b5a9b2d02a 100644 --- a/setup.py +++ b/setup.py @@ -342,6 +342,8 @@ def get_torch_arch_list() -> Set[str]: if _is_cuda(): vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") + vllm_extension_sources.append( + "csrc/quantization/marlin/marlin_cuda_kernel.cu") vllm_extension_sources.append("csrc/custom_all_reduce.cu") # Add MoE kernels. diff --git a/tests/conftest.py b/tests/conftest.py index 30a3df89d9f12..6eb8159837d51 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -199,6 +199,24 @@ def generate( outputs.append((req_sample_output_ids, req_sample_output_strs)) return outputs + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + ) -> List[Tuple[List[int], str]]: + assert sampling_params.logprobs is not None + + req_outputs = self.model.generate(prompts, + sampling_params=sampling_params) + outputs = [] + for req_output in req_outputs: + for sample in req_output.outputs: + output_str = sample.text + output_ids = sample.token_ids + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs)) + return outputs + def generate_greedy( self, prompts: List[str], @@ -209,6 +227,20 @@ def generate_greedy( return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + ) -> List[Tuple[List[int], str]]: + greedy_logprobs_params = SamplingParams(temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs) + outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params) + + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + def generate_beam_search( self, prompts: List[str], diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py new file mode 100644 index 0000000000000..f3cc517364f06 --- /dev/null +++ b/tests/models/test_marlin.py @@ -0,0 +1,97 @@ +"""Compare the outputs of a GPTQ model to a Marlin model. + +Note: GPTQ and Marlin do not have bitwise correctness. +As a result, in this test, we just confirm that the top selected tokens of the +Marlin/GPTQ models are in the top 3 selections of each other. + +Note: Marlin internally uses locks to synchronize the threads. 
This can +result in very slight nondeterminism for Marlin. As a result, we re-run the test +up to 3 times to see if we pass. + +Run `pytest tests/models/test_marlin.py --forked`. +""" + +import pytest +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY + +capability = torch.cuda.get_device_capability() +capability = capability[0] * 10 + capability[1] +marlin_not_supported = ( + capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability()) + + +@dataclass +class ModelPair: + model_marlin: str + model_gptq: str + + +model_pairs = [ + ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", + model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), + ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", + model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), + ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", + model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") +] + + +@pytest.mark.flaky(reruns=2) +@pytest.mark.skipif(marlin_not_supported, + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("model_pair", model_pairs) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [3]) +def test_models( + vllm_runner, + example_prompts, + model_pair: ModelPair, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype) + marlin_outputs = marlin_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting the just model + # frees the memory. + del marlin_model.model.llm_engine.driver_worker + del marlin_model + + gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype) + gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts, + max_tokens, + num_logprobs) + + # Note: not sure why, but deleting just the model on Ada Lovelace + # does not free the GPU memory. On Ampere, deleting the just model + # frees the memory. + del gptq_model.model.llm_engine.driver_worker + del gptq_model + + # loop through the prompts + for prompt_idx in range(len(example_prompts)): + gptq_output_ids, gptq_output_str, gptq_logprobs = gptq_outputs[ + prompt_idx] + marlin_output_ids, marlin_output_str, marlin_logprobs = marlin_outputs[ + prompt_idx] + + for idx, (gptq_output_id, marlin_output_id) in enumerate( + zip(gptq_output_ids, marlin_output_ids)): + # If sequence is not an exact match, + if marlin_output_id != gptq_output_id: + # Each predicted token must be in top 5 of the other's + assert gptq_output_id in marlin_logprobs[idx], ( + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" + ) + assert marlin_output_id in gptq_logprobs[idx], ( + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" + ) + + # Break out since sequences will now diverge. 
+ break diff --git a/vllm/config.py b/vllm/config.py index 2f8883fe0733e..b4d48d34a8a72 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -155,15 +155,21 @@ def _verify_tokenizer_mode(self) -> None: self.tokenizer_mode = tokenizer_mode def _verify_quantization(self) -> None: - supported_quantization = ["awq", "gptq", "squeezellm"] - rocm_not_supported_quantization = ["awq"] + supported_quantization = ["awq", "gptq", "squeezellm", "marlin"] + rocm_not_supported_quantization = ["awq", "marlin"] if self.quantization is not None: self.quantization = self.quantization.lower() # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: + hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. + if (hf_quant_method == "gptq" + and "is_marlin_format" in hf_quant_config + and hf_quant_config["is_marlin_format"]): + hf_quant_method = "marlin" if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: @@ -183,9 +189,11 @@ def _verify_quantization(self) -> None: raise ValueError( f"{self.quantization} quantization is currently not supported " f"in ROCm.") - logger.warning(f"{self.quantization} quantization is not fully " - "optimized yet. The speed can be slower than " - "non-quantized models.") + if self.quantization != "marlin": + logger.warning( + f"{self.quantization} quantization is not fully " + "optimized yet. The speed can be slower than " + "non-quantized models.") def _verify_cuda_graph(self) -> None: if self.max_context_len_to_capture is None: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 55d38b763b2b5..b2396a1d6f141 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -17,6 +17,14 @@ logger = init_logger(__name__) +def adjust_marlin_shard(param, shard_size, shard_offset): + marlin_tile_size = getattr(param, "marlin_tile_size", None) + if marlin_tile_size is None: + return shard_size, shard_offset + + return shard_size * marlin_tile_size, shard_offset * marlin_tile_size + + class LinearMethodBase(ABC): """Base class for different (maybe quantized) linear methods.""" @@ -276,6 +284,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -293,6 +306,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size @@ -372,6 +390,7 @@ def weight_loader(self, loaded_shard_id: Optional[str] = None): param_data = param.data output_dim = getattr(param, "output_dim", None) + if loaded_shard_id is None: # Loaded weight is already packed. 
if output_dim is None: @@ -393,6 +412,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -417,6 +441,11 @@ def weight_loader(self, if packed_dim == output_dim: shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor + + # If marlin, we need to adjust the offset and size to account for the tiling. + shard_size, shard_offset = adjust_marlin_shard( + param, shard_size, shard_offset) + param_data = param_data.narrow(output_dim, shard_offset, shard_size) if loaded_shard_id == "q": diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index b3449eaff0e35..dc54641878c64 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -4,11 +4,13 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig +from vllm.model_executor.layers.quantization.marlin import MarlinConfig _QUANTIZATION_CONFIG_REGISTRY = { "awq": AWQConfig, "gptq": GPTQConfig, "squeezellm": SqueezeLLMConfig, + "marlin": MarlinConfig, } diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py new file mode 100644 index 0000000000000..7566d78a8aba4 --- /dev/null +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -0,0 +1,210 @@ +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm._C import ops +from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + + +class MarlinConfig(QuantizationConfig): + """Config class for Marlin. + + Reference: https://github.com/IST-DASLab/marlin/tree/master + """ + + def __init__( + self, + group_size: int, + ) -> None: + # Group size for the quantization. + self.group_size = group_size + if self.group_size != 128 and self.group_size != -1: + raise ValueError( + "Currently, only group size 128 and -1 (channelwise) is supported for " + f"Marlin, but got group_size of {self.group_size}") + + # 4 Bits packed into 32 bit datatype. + self.pack_factor = 32 // 4 + + # Tile size used by marlin kernels. + self.tile_size = 16 + + # Min out_features dim + self.min_n_threads = 64 + + # Min in_features dim + self.min_k_threads = 128 + + # Max parallel problems to solve at once (improves large batch performance) + self.max_parallel = 16 + + # Permutation length used by the marlin kernels. 
+ self.perm_len = 1024 + + def __repr__(self) -> str: + return f"MarlinConfig(group_size={self.group_size}" + + @classmethod + def get_name(cls) -> str: + return "marlin" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half] + + @classmethod + # Need to figure it out + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "MarlinConfig": + group_size = cls.get_from_keys(config, ["group_size"]) + return cls(group_size) + + def get_linear_method(self) -> "MarlinLinearMethod": + return MarlinLinearMethod(self) + + def get_scaled_act_names(self) -> List[str]: + return [] + + +class MarlinLinearMethod(LinearMethodBase): + """Linear method for Marlin. + + Args: + quant_config: The Marlin quantization config. + """ + + def __init__(self, quant_config: MarlinConfig): + self.quant_config = quant_config + + def create_weights( + self, + input_size_per_partition: int, + output_size_per_partition: int, + input_size: int, + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + del output_size # Unused. + + if params_dtype != torch.float16: + raise ValueError( + f"The params dtype must be float16, but got {params_dtype}") + + # Validate output_size_per_partition + if output_size_per_partition % self.quant_config.min_n_threads != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." + ) + if output_size_per_partition % self.quant_config.pack_factor != 0: + raise ValueError( + f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." + ) + + # Validate input_size_per_partition + if input_size_per_partition % self.quant_config.min_k_threads != 0: + raise ValueError( + f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." + ) + if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: + raise ValueError( + f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." + ) + + # Check that we have at least 4 tiles horizontally in the shard + num_tiles_per_perm = self.quant_config.perm_len // ( + self.quant_config.tile_size**2) + if output_size_per_partition % num_tiles_per_perm != 0: + raise ValueError( + "Each permutation group must reside on the same gpu") + + # Quantized 4Bit weights packed into Int32. 
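
As a quick sanity check on the shapes allocated below (illustrative sizes, not taken from any particular model): a [K, N] fp16 layer stored as 4-bit Marlin weights with tile_size 16 and pack_factor 8 gives

K, N = 4096, 4096          # input_size_per_partition, output_size_per_partition
tile, pack, group = 16, 8, 128

qweight_shape = (K // tile, N * tile // pack)   # int32 tensor, as allocated below
scales_shape = (K // group, N)                  # fp16; a single row when channelwise (group_size == -1)

assert qweight_shape == (256, 8192)
assert scales_shape == (32, 4096)
# The packing is lossless in element count: K*N 4-bit values, 8 per int32 word.
assert qweight_shape[0] * qweight_shape[1] == K * N // pack
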
+ qweight = Parameter( + torch.empty( + input_size_per_partition // self.quant_config.tile_size, + output_size_per_partition * self.quant_config.tile_size // + self.quant_config.pack_factor, + device="cuda", + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs( + qweight, + { + "input_dim": 0, + "output_dim": 1, + "packed_dim": 1, + "pack_factor": self.quant_config.pack_factor, + "marlin_tile_size": self.quant_config.tile_size, + }, + ) + + # Determine if channelwise or not + input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + + scales = Parameter( + torch.empty( + input_groups, + output_size_per_partition, + device="cuda", + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs( + scales, + { + "input_dim": None if input_groups == 1 else 0, + "output_dim": 1, + }, + ) + + # Allocate workspace (Used for internal locking mechanism) + max_workspace_size = ( + output_size_per_partition // + self.quant_config.min_n_threads) * self.quant_config.max_parallel + workspace = Parameter(torch.zeros(max_workspace_size, + device="cuda", + dtype=torch.int), + requires_grad=False) + + return { + "B": qweight, + "s": scales, + "workspace": workspace, + } + + def apply_weights( + self, + weights: Dict[str, Any], + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qweight = weights["B"] + scales = weights["s"] + workspace = weights["workspace"] + + x_2d = x.view(-1, x.shape[-1]) + + size_m = x_2d.shape[0] + size_k = x_2d.shape[1] + size_n = scales.shape[1] + + output_2d = ops.marlin_gemm(x_2d, qweight, scales, workspace, size_m, + size_n, size_k) + + output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], )) + + if bias is not None: + output.add_(bias) # In-place add + + return output From 82091b864af105dbe373353655dc9d8c0a6ba66f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 1 Mar 2024 12:58:06 -0800 Subject: [PATCH 041/113] Bump up to v0.3.3 (#3129) --- vllm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 7ff92d8cc681d..f1e30f5eb6e6e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -8,7 +8,7 @@ from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams -__version__ = "0.3.2" +__version__ = "0.3.3" __all__ = [ "LLM", From 29e70e3e88698feca9509cf07fcf06b12163f1c3 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Sat, 2 Mar 2024 07:28:41 +0800 Subject: [PATCH 042/113] allow user chose log level by --log-level instead of fixed 'info'. 
(#3109) Co-authored-by: zixiao Co-authored-by: Simon Mo --- vllm/config.py | 2 +- vllm/engine/metrics.py | 1 + vllm/entrypoints/openai/api_server.py | 8 +++++++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b4d48d34a8a72..e260e6a0cb1d6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -317,7 +317,7 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value:str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 54b09c38f58a5..d31542159e4a4 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -23,6 +23,7 @@ def __init__(self, labelnames: List[str]): if hasattr(collector, "_name") and "vllm" in collector._name: REGISTRY.unregister(collector) + # Config Information self.info_cache_config = Info( name='vllm:cache_config', documentation='information of cache_config') diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b2f040114a078..3777e0f3a0601 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -62,6 +62,12 @@ def parse_args(): description="vLLM OpenAI-Compatible RESTful API server.") parser.add_argument("--host", type=str, default=None, help="host name") parser.add_argument("--port", type=int, default=8000, help="port number") + parser.add_argument( + "--uvicorn-log-level", + type=str, + default="info", + choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'], + help="log level for uvicorn") parser.add_argument("--allow-credentials", action="store_true", help="allow credentials") @@ -245,7 +251,7 @@ async def authentication(request: Request, call_next): uvicorn.run(app, host=args.host, port=args.port, - log_level="info", + log_level=args.uvicorn_log_level, timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, ssl_certfile=args.ssl_certfile) From baee28c46c242b72f90d6b1211ab9d7872ab05d3 Mon Sep 17 00:00:00 2001 From: cloudhan Date: Sat, 2 Mar 2024 14:34:48 +0800 Subject: [PATCH 043/113] Reorder kv dtype check to avoid nvcc not found error on AMD platform (#3104) --- vllm/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index e260e6a0cb1d6..ff8536c1aca55 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -330,15 +330,14 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8_e5m2": + if is_hip(): + raise NotImplementedError( + "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"): raise ValueError( "FP8 is not supported when cuda version is lower than 11.8." ) - device_name = torch.cuda.get_device_name() - if "AMD" in device_name: - raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") logger.info( "Using fp8_e5m2 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. 
" From ce4f5a29fb3e35041842518fefe999847b8326b9 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Sat, 2 Mar 2024 03:50:01 -0500 Subject: [PATCH 044/113] Add Automatic Prefix Caching (#2762) Co-authored-by: ElizaWszola Co-authored-by: Michael Goin --- benchmarks/benchmark_throughput.py | 30 ++- docs/source/models/engine_args.rst | 4 + examples/offline_inference_with_prefix.py | 11 +- tests/prefix_caching/test_prefix_caching.py | 103 ++++--- tests/test_cache_block_hashing.py | 76 ++++++ vllm/block.py | 14 +- vllm/config.py | 2 + vllm/core/block_manager.py | 285 +++++++++++++++----- vllm/core/evictor.py | 161 +++++++++++ vllm/core/scheduler.py | 15 +- vllm/engine/arg_utils.py | 9 +- vllm/engine/async_llm_engine.py | 14 +- vllm/engine/llm_engine.py | 26 +- vllm/entrypoints/api_server.py | 6 +- vllm/entrypoints/llm.py | 14 +- vllm/prefix.py | 87 ------ vllm/sequence.py | 23 +- vllm/worker/model_runner.py | 30 ++- 18 files changed, 618 insertions(+), 292 deletions(-) create mode 100644 tests/test_cache_block_hashing.py create mode 100644 vllm/core/evictor.py delete mode 100644 vllm/prefix.py diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1ad502526c97c..51c1a6540a451 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,21 +73,21 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, + enable_prefix_caching: bool, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - device=device, - ) + llm = LLM(model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + device=device, + enable_prefix_caching=enable_prefix_caching) # Add the requests to the engine. for prompt, _, output_len in requests: @@ -211,7 +211,8 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device) + args.kv_cache_dtype, args.device, + args.enable_prefix_caching) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -302,6 +303,7 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument("--enable_prefix_caching", action='store_true') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst index d89b795149501..9f5f672ae4f34 100644 --- a/docs/source/models/engine_args.rst +++ b/docs/source/models/engine_args.rst @@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM: Token block size for contiguous chunks of tokens. +.. option:: --enable-prefix-caching + + Enables automatic prefix caching + .. option:: --seed Random seed for operations. 
diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 8ccfb1ceea731..1aa718b88907c 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -37,20 +37,13 @@ print("-" * 80) -# -1 since the last token can change when concatenating prompts. -prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - # The llm.generate call will batch all prompts and send the batch at once if resources allow. # The prefix will only be cached after the first batch is processed, so we need to call generate once # to calculate the prefix and cache it. -outputs = llm.generate(generating_prompts[0], - sampling_params, - prefix_pos=[prefix_pos]) +outputs = llm.generate(generating_prompts[0], sampling_params) # Subsequent batches can leverage the cached prefix -outputs = llm.generate(generating_prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(generating_prompts)) +outputs = llm.generate(generating_prompts, sampling_params) # Print the outputs. You should see the same outputs as before for output in outputs: diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 1e301bedfc21e..7ef8dde7bb8f6 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -4,38 +4,73 @@ """ import pytest -from vllm import LLM, SamplingParams - -prefix = ( - "You are an expert school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on these information, fulfill " - "the following paragraph: ") - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_prefix_caching( - example_prompts, - model: str, - max_tokens: int, +from vllm.core.block_manager import BlockAllocator +from vllm.utils import Device + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_blocks", [16]) +def test_block_allocator( + block_size: int, + num_blocks: int, ): - llm = LLM(model=model) - # -1 since the last token can change when concatenating prompts. 
- prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1 - prompts = [prefix + prompt for prompt in example_prompts] - sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs_without_prefix = llm.generate(prompts, sampling_params) - outputs_with_prefix = llm.generate(prompts, - sampling_params, - prefix_pos=[prefix_pos] * len(prompts)) - for output_without_prefix, output_with_prefix in zip( - outputs_without_prefix, outputs_with_prefix): - assert (output_without_prefix.outputs[0].token_ids == - output_with_prefix.outputs[0].token_ids) - assert len(llm.llm_engine.scheduler.prefix_pool.prefixes) == 1 + block_hash = 1 + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) + + # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + first_block = block_allocator.allocate(block_hash, 0) + second_block = block_allocator.allocate(block_hash, 0) + assert (first_block == second_block) + assert (second_block.ref_count == 2) + + # Free the first_block and confirm that the ref_count is correctly decremented on the second block + block_allocator.free(first_block) + assert (second_block.ref_count == 1) + + # Free the second block + block_allocator.free(second_block) + + # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + first_block = block_allocator.allocate(block_hash, 0) + assert (first_block == second_block) + assert (first_block.block_hash == block_hash) + + +@pytest.mark.parametrize("num_blocks", [16]) +def test_eviction(num_blocks: int, ): + block_size = 16 + block_allocator = BlockAllocator(Device.CPU, + block_size, + num_blocks, + enable_caching=True) + blocks = [] + + for i in range(num_blocks): + # use i as the block_hash + blocks.append(block_allocator.allocate(i, 0)) + + #Free all blocks + for block in blocks: + block_allocator.free(block) + + # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + new_block_hash = block_size + new_block = block_allocator.allocate(new_block_hash, 0) + assert (new_block == blocks[0]) + assert (new_block.block_hash == new_block_hash) + + # Reallocate the second in blocks to remove it from the free list + realloc_block_hash = 1 + realloc_block = block_allocator.allocate(realloc_block_hash, 0) + assert (realloc_block == blocks[realloc_block_hash]) + assert (realloc_block.block_hash == realloc_block_hash) + + # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + new_block_hash = block_size + 1 + new_block = block_allocator.allocate(new_block_hash, 0) + assert (realloc_block != new_block) + assert (new_block.block_hash == new_block_hash) + assert (new_block.block_number == 2) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py new file mode 100644 index 0000000000000..7c4ade7f8c8ed --- /dev/null +++ b/tests/test_cache_block_hashing.py @@ -0,0 +1,76 @@ +"""Test hashing of cache blocks. + +Run `pytest tests/test_cache_block_hashing.py`. +""" +import pytest + +from vllm.transformers_utils.tokenizer import TokenizerGroup +from vllm.sequence import Sequence + +# Make two prefixes with different first blocks. +prefix_start = [("You are an expert"), ("You are a")] +prefix_common = ( + " school principal, skilled in effectively managing " + "faculty and staff. 
Draft 10-15 questions for a potential first grade " + "Head Teacher for my K-12, all-girls', independent school that emphasizes " + "community, joyful discovery, and life-long learning. The candidate is " + "coming in for a first-round panel interview for a 8th grade Math " + "teaching role. They have 5 years of previous teaching experience " + "as an assistant teacher at a co-ed, public school with experience " + "in middle school math teaching. Based on this, fulfill " + "the following: ") +prefixes = [start + prefix_common for start in prefix_start] + +# Sample prompts. +sample_prompts = [ + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" +] + + +# Helper function. +def flatten_2d(li): + return [lss for ls in li for lss in ls] + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("max_num_seqs", [256]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): + + tokenizer = TokenizerGroup( + tokenizer_id="facebook/opt-125m", + enable_lora=False, + max_num_seqs=max_num_seqs, + max_input_length=None, + ) + + hashes = [] + + for prefix in prefixes: + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 + + # Check that hashes made with two prefixes with different first blocks are + # different everywhere. + for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): + assert (hash0 != hash1) + + # Check that hashes of different prompts made with the same prefix are the + # same until the hashes that contain the prompt. + for hash_pref in hashes: + same_hashes = [tuple(h[:-1]) for h in hash_pref] + different_hashes = [h[-1] for h in hash_pref] + assert (len(set(same_hashes)) == 1) + assert (len(set(different_hashes)) == len(different_hashes)) diff --git a/vllm/block.py b/vllm/block.py index 5fe39ed47b2ff..2cc6b947f2255 100644 --- a/vllm/block.py +++ b/vllm/block.py @@ -5,6 +5,8 @@ _BLANK_TOKEN_ID = -1 +DEFAULT_LAST_ACCESSED_TIME = -1 + class LogicalTokenBlock: """A block that stores a contiguous chunk of tokens from left to right. @@ -55,17 +57,27 @@ def __init__( device: Device, block_number: int, block_size: int, + block_hash: int, + num_hashed_tokens: int, ) -> None: self.device = device self.block_number = block_number self.block_size = block_size + self.block_hash = block_hash + self.num_hashed_tokens = num_hashed_tokens self.ref_count = 0 + self.last_accessed = DEFAULT_LAST_ACCESSED_TIME + + self.computed = False def __repr__(self) -> str: return (f'PhysicalTokenBlock(device={self.device}, ' f'block_number={self.block_number}, ' - f'ref_count={self.ref_count})') + f'num_hashed_tokens={self.num_hashed_tokens}, ' + f'ref_count={self.ref_count}, ' + f'last_accessed={self.last_accessed}, ' + f'computed={self.computed})') # Mapping: logical block number -> physical block. 
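
The new `block_hash` and `num_hashed_tokens` fields are filled from `Sequence.hash_of_block`, which conceptually hashes every prompt token up to and including a given logical block, so sequences that share a prefix share hashes for the shared blocks and diverge afterwards. The sketch below models that behaviour; it is not the exact implementation:

def hash_of_block(token_ids, block_size, logical_idx):
    # The hash covers every token up to and including this logical block.
    num_hashed = (logical_idx + 1) * block_size
    return hash(tuple(token_ids[:num_hashed]))

a = list(range(40))                 # two prompts that share their first 32 tokens
b = list(range(32)) + [99] * 8
assert hash_of_block(a, 16, 0) == hash_of_block(b, 16, 0)
assert hash_of_block(a, 16, 1) == hash_of_block(b, 16, 1)
assert hash_of_block(a, 16, 2) != hash_of_block(b, 16, 2)
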
diff --git a/vllm/config.py b/vllm/config.py index ff8536c1aca55..876a439cd1280 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -303,12 +303,14 @@ def __init__( swap_space: int, cache_dtype: str, sliding_window: Optional[int] = None, + enable_prefix_caching: bool = False, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.enable_prefix_caching = enable_prefix_caching self._verify_args() self._verify_cache_dtype() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 3946096d4296a..08d519ab767a9 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,10 +1,13 @@ """A block manager that manages token blocks.""" import enum +from itertools import count +from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor class BlockAllocator: @@ -15,29 +18,68 @@ class BlockAllocator: the reference count becomes zero, the block is added back to the free list. """ - def __init__( - self, - device: Device, - block_size: int, - num_blocks: int, - ) -> None: + def __init__(self, + device: Device, + block_size: int, + num_blocks: int, + eviction_policy: EvictionPolicy = EvictionPolicy.LRU, + enable_caching: bool = False) -> None: self.device = device self.block_size = block_size self.num_blocks = num_blocks + self.enable_caching = enable_caching + + self.current_num_blocks = 0 + self.cached_blocks: Dict[int, PhysicalTokenBlock] = {} + + # Switch over to FIFO eviction when caching is disabled + if not self.enable_caching: + eviction_policy = EvictionPolicy.FIFO + self.evictor: Evictor = make_evictor(eviction_policy) + + self.default_hash_ctr = count() + + def allocate_block(self, block_hash: int, + num_hashed_tokens: int) -> PhysicalTokenBlock: + if self.current_num_blocks == self.num_blocks: + block = self.evictor.evict() + block.block_hash = block_hash + block.num_hashed_tokens = num_hashed_tokens + return block + block = PhysicalTokenBlock(device=self.device, + block_number=self.current_num_blocks, + block_size=self.block_size, + block_hash=block_hash, + num_hashed_tokens=num_hashed_tokens) + self.current_num_blocks += 1 + return block - # Initialize the free blocks. - self.free_blocks: BlockTable = [] - for i in range(num_blocks): - block = PhysicalTokenBlock(device=device, - block_number=i, - block_size=block_size) - self.free_blocks.append(block) - - def allocate(self) -> PhysicalTokenBlock: - if not self.free_blocks: - raise ValueError("Out of memory! 
No free blocks are available.") - block = self.free_blocks.pop() - block.ref_count = 1 + def allocate(self, + block_hash: Optional[int] = None, + num_hashed_tokens: int = 0) -> PhysicalTokenBlock: + # If caching is disabled, just allocate a new block and return it + if not self.enable_caching: + block = self.allocate_block(next(self.default_hash_ctr), + num_hashed_tokens) + block.ref_count += 1 + return block + + if block_hash is None: + block_hash = next(self.default_hash_ctr) + if block_hash in self.evictor: + assert block_hash not in self.cached_blocks + block = self.evictor.remove(block_hash) + assert block.ref_count == 0 + self.cached_blocks[block_hash] = block + block.ref_count += 1 + assert block.block_hash == block_hash + return block + if block_hash not in self.cached_blocks: + self.cached_blocks[block_hash] = self.allocate_block( + block_hash, num_hashed_tokens) + block = self.cached_blocks[block_hash] + assert block.block_hash == block_hash + block.ref_count += 1 return block def free(self, block: PhysicalTokenBlock) -> None: @@ -45,10 +87,27 @@ def free(self, block: PhysicalTokenBlock) -> None: raise ValueError(f"Double free! {block} is already freed.") block.ref_count -= 1 if block.ref_count == 0: - self.free_blocks.append(block) + assert block.block_hash not in self.evictor + self.evictor.add(block) + + # If caching is enabled, remove the block from the cached_blocks + if self.enable_caching: + del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return len(self.free_blocks) + return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + + def contains_block(self, block_hash: int) -> bool: + return block_hash in self.cached_blocks or block_hash in self.evictor + + def update_hash(self, block_hash: int, block: PhysicalTokenBlock): + # If caching is enabled, update the hash of block and the cached_blocks dictionary. + if self.enable_caching: + assert not self.contains_block(block_hash) + old_hash = block.block_hash + block.block_hash = block_hash + del self.cached_blocks[old_hash] + self.cached_blocks[block_hash] = block class AllocStatus(enum.Enum): @@ -75,6 +134,7 @@ def __init__( num_cpu_blocks: int, watermark: float = 0.01, sliding_window: Optional[int] = None, + enable_caching: bool = False, ) -> None: self.block_size = block_size self.num_total_gpu_blocks = num_gpu_blocks @@ -89,11 +149,17 @@ def __init__( self.watermark = watermark assert watermark >= 0.0 + self.enable_caching = enable_caching + self.watermark_blocks = int(watermark * num_gpu_blocks) - self.gpu_allocator = BlockAllocator(Device.GPU, block_size, - num_gpu_blocks) - self.cpu_allocator = BlockAllocator(Device.CPU, block_size, - num_cpu_blocks) + self.gpu_allocator = BlockAllocator(Device.GPU, + block_size, + num_gpu_blocks, + enable_caching=enable_caching) + self.cpu_allocator = BlockAllocator(Device.CPU, + block_size, + num_cpu_blocks, + enable_caching=enable_caching) # Mapping: seq_id -> BlockTable. 
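# --- Editor's sketch (not part of the patch): the allocate/free contract of the
# caching BlockAllocator above, reduced to a ref-counted, hash-keyed table. The
# fixed pool size and the evictor of the real allocator are deliberately ignored;
# the class name is hypothetical.
class _TinyCachingAllocator:

    def __init__(self):
        self._cached = {}  # block_hash -> mutable [ref_count]

    def allocate(self, block_hash):
        entry = self._cached.setdefault(block_hash, [0])
        entry[0] += 1      # a cache hit returns the same entry and bumps its refs
        return entry

    def free(self, entry):
        if entry[0] == 0:
            raise ValueError("Double free!")
        entry[0] -= 1      # at ref_count == 0 the real allocator hands it to the evictor

_alloc = _TinyCachingAllocator()
_b1 = _alloc.allocate(block_hash=123)
_b2 = _alloc.allocate(block_hash=123)
assert _b1 is _b2 and _b1[0] == 2      # equal prefix hash -> one shared physical block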
self.block_tables: Dict[int, BlockTable] = {} @@ -103,9 +169,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = len(seq.logical_token_blocks) - if seq_group.prefix is not None and seq_group.prefix.allocated: - num_required_blocks -= seq_group.prefix.get_num_blocks() - if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -129,36 +192,16 @@ def allocate(self, seq_group: SequenceGroup) -> None: num_prompt_blocks = len(seq.logical_token_blocks) block_table: BlockTable = [] - prefix_block_table: BlockTable = [] - num_prefix_blocks = 0 - - prefix = seq_group.prefix - if prefix is not None and prefix.allocated: - # Prefix has already been allocated. Use the existing block table. - num_prompt_blocks -= prefix.get_num_blocks() - for block in prefix.block_table: - block.ref_count += seq_group.num_seqs() - block_table.append(block) - for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] else: - block = self.gpu_allocator.allocate() - # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block = self.gpu_allocator.allocate( + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) block_table.append(block) - if prefix is not None and not prefix.allocated: - # Allocate blocks for the prefix, we will compute the prefix's - # KV cache in this run. - num_prefix_blocks = prefix.get_num_blocks() - prefix_block_table = block_table[:num_prefix_blocks] - for block in prefix_block_table: - block.ref_count += 1 - prefix.set_block_table(prefix_block_table) - # Assign the block table for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() @@ -170,12 +213,72 @@ def can_append_slot(self, seq_group: SequenceGroup) -> bool: num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING) return num_seqs <= num_free_gpu_blocks - def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: + def _promote_last_block( + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: + # Compute a new hash for the block so that it can be shared by other Sequences + new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + + # if new_hash is already in the cached table, then free last_block and return the cached version + if self.gpu_allocator.contains_block(new_hash): + self.gpu_allocator.free(last_block) + return self.gpu_allocator.allocate(new_hash) + else: + self.gpu_allocator.update_hash(new_hash, last_block) + return last_block + + def _is_last_block_full( + self, + seq: Sequence, + ) -> bool: + token_ids_len = len(seq.data.get_token_ids()) + return token_ids_len > 0 and token_ids_len % seq.block_size == 0 + + def _is_last_block( + self, + seq: Sequence, + index: int, + ) -> bool: + return index == len(seq.logical_token_blocks) - 1 + + def _maybe_promote_last_block( + self, + seq: Sequence, + last_block: PhysicalTokenBlock, + ) -> PhysicalTokenBlock: + if self._is_last_block_full(seq): + return self._promote_last_block(seq, last_block) + else: + return last_block + + def _allocate_last_physical_block( + self, + seq: Sequence, + ) -> PhysicalTokenBlock: + block_hash: Optional[int] = None + if (self._is_last_block_full(seq)): + block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) + num_hashed_tokens = seq.num_hashed_tokens_of_block( + len(seq.logical_token_blocks) - 1) + new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens) + if block_hash is None: + assert new_block.ref_count == 1 + return new_block + + def append_slot( + self, + seq: Sequence, + ) -> Optional[Tuple[int, int]]: """Allocate a physical slot for a new token.""" logical_blocks = seq.logical_token_blocks block_table = self.block_tables[seq.seq_id] - + # If we need to allocate a new physical block if len(block_table) < len(logical_blocks): + # Currently this code only supports adding one physical block + assert len(block_table) == len(logical_blocks) - 1 + if (self.block_sliding_window and len(block_table) >= self.block_sliding_window): # reuse a block @@ -184,8 +287,8 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: else: # The sequence has a new logical block. # Allocate a new physical block. - block = self.gpu_allocator.allocate() - block_table.append(block) + new_block = self._allocate_last_physical_block(seq) + block_table.append(new_block) return None # We want to append the token to the last physical block. @@ -193,11 +296,15 @@ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]: assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. + # If the last block is now complete, promote it to a full block so that it can be shared + new_block = self._maybe_promote_last_block(seq, last_block) + block_table[-1] = new_block return None else: # The last block is shared with other sequences. # Copy on Write: Allocate a new block and copy the tokens. 
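# --- Editor's sketch (not part of the patch): when the promotion above can happen.
# A block is only given a content hash and made shareable once it is exactly full;
# a partially filled last block stays private to its sequence. The block size is
# illustrative.
_BLOCK_SIZE = 16

def _is_last_block_full(num_tokens: int) -> bool:
    return num_tokens > 0 and num_tokens % _BLOCK_SIZE == 0

assert not _is_last_block_full(15)   # still filling -> keep the private block
assert _is_last_block_full(16)       # exactly full -> hash it and promote for sharing
assert not _is_last_block_full(17)   # a new partial last block has started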
- new_block = self.gpu_allocator.allocate() + new_block = self._allocate_last_physical_block(seq) + block_table[-1] = new_block self.gpu_allocator.free(last_block) return last_block.block_number, new_block.block_number @@ -233,25 +340,18 @@ def can_swap_in(self, seq_group: SequenceGroup) -> bool: def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. - if seq_group.prefix is not None: - # make sure to swap in the prefix first - assert seq_group.prefix.allocated and seq_group.prefix.computed - mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): new_block_table: BlockTable = [] block_table = self.block_tables[seq.seq_id] - if seq_group.prefix is not None: - for block in seq_group.prefix.block_table: - new_block_table.append(block) - block.ref_count += 1 for cpu_block in block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] gpu_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate() + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) mapping[cpu_block] = gpu_block new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. @@ -276,17 +376,12 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: block_table = self.block_tables[seq.seq_id] for gpu_block in block_table: - if (seq_group.prefix is not None - and gpu_block in seq_group.prefix.block_table): - # NOTE: We do not swap out the prefix blocks for now. - self.gpu_allocator.free(gpu_block) - continue - if gpu_block in mapping: cpu_block = mapping[gpu_block] cpu_block.ref_count += 1 else: - cpu_block = self.cpu_allocator.allocate() + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) mapping[gpu_block] = cpu_block new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. @@ -328,3 +423,49 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.cpu_allocator.get_num_free_blocks() + + def access_all_blocks_in_seq( + self, + seq: Sequence, + access_time: float, + ) -> None: + block_table = self.block_tables[seq.seq_id] + for block in block_table: + block.last_accessed = access_time + + def compute_last_full_block_in_seq(self, seq: Sequence): + if seq.seq_id not in self.block_tables: + return + max_full_block = seq.get_len() // seq.block_size - 1 + block_table = self.block_tables[seq.seq_id] + if max_full_block == -1: + return + block_table[max_full_block].computed = True + + def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + if seq.seq_id not in self.block_tables: + return [] + block_table = self.block_tables[seq.seq_id] + for block_idx in reversed(range(len(block_table))): + if block_table[block_idx].computed: + return [b.block_number for b in block_table[:block_idx + 1]] + return [] + + # Can return non-empty result only with prefix caching enabled. + def get_common_computed_block_ids(self, + seq_group: SequenceGroup) -> List[int]: + if not self.enable_caching: + return [] + + ids_list = [ + self.get_all_block_ids_till_computed(seq) + for seq in iter(seq_group.seqs_dict.values()) + ] + return commonprefix([ids for ids in ids_list if ids != []]) + + # We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. 
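# --- Editor's sketch (not part of the patch): get_common_computed_block_ids above
# in miniature. A sequence group may only skip blocks that are computed for every
# one of its sequences, hence the commonprefix over the per-sequence lists of
# computed block ids (the ids below are illustrative).
from os.path import commonprefix

_ids_list = [[3, 7, 9], [3, 7], [3, 7, 11]]
assert commonprefix([ids for ids in _ids_list if ids != []]) == [3, 7]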
+ def mark_blocks_as_computed(self, seq_group: SequenceGroup): + if self.enable_caching: + for seq in seq_group.seqs_dict.values(): + self.compute_last_full_block_in_seq(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py new file mode 100644 index 0000000000000..b538ea574b604 --- /dev/null +++ b/vllm/core/evictor.py @@ -0,0 +1,161 @@ +import enum +from typing import Dict, List, Optional +from abc import ABC, abstractmethod, abstractproperty + +from vllm.block import PhysicalTokenBlock + + +class EvictionPolicy(enum.Enum): + """Enum for eviction policy used by make_evictor to instantiate the correct + Evictor subclass. + """ + LRU = enum.auto() + FIFO = enum.auto() + + +class Evictor(ABC): + """The Evictor subclasses should be used by the BlockAllocator class to + handle eviction of freed PhysicalTokenBlocks. + """ + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def __contains__(self, block_hash: int) -> bool: + pass + + @abstractmethod + def evict(self) -> PhysicalTokenBlock: + """Runs the eviction algorithm and returns the evicted block""" + pass + + @abstractmethod + def add(self, block: PhysicalTokenBlock): + """Adds block to the evictor, making it a candidate for eviction""" + pass + + @abstractmethod + def remove(self, block_hash: int) -> PhysicalTokenBlock: + """Simply removes the block with the hash value block_hash from the + evictor. Caller is responsible for making sure that block_hash is contained + in the evictor before calling remove. Should be used to "bring back" blocks + that have been freed but not evicted yet. + """ + pass + + @abstractproperty + def num_blocks(self) -> int: + pass + + +class LRUEvictor(Evictor): + """Evicts in a least-recently-used order using the last_accessed timestamp + that's recorded in the PhysicalTokenBlock. If there are multiple blocks with + the same last_accessed time, then the one with the largest num_hashed_tokens + will be evicted. If two blocks each have the lowest last_accessed time and + highest num_hashed_tokens value, then one will be chose arbitrarily + """ + + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + # TODO: The performance of this evict function can be optimized further. 
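# --- Editor's sketch (not part of the patch): the selection rule that evict() below
# implements, stated as a single ordering. The block records are illustrative dicts,
# not PhysicalTokenBlock objects.
_free_blocks = [
    {"hash": 1, "last_accessed": 10.0, "num_hashed_tokens": 32},
    {"hash": 2, "last_accessed": 5.0, "num_hashed_tokens": 16},
    {"hash": 3, "last_accessed": 5.0, "num_hashed_tokens": 48},
]
_victim = max(_free_blocks,
              key=lambda b: (-b["last_accessed"], b["num_hashed_tokens"]))
assert _victim["hash"] == 3   # oldest access wins; ties go to the longest hashed prefix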
+ def evict(self) -> PhysicalTokenBlock: + free_blocks: List[PhysicalTokenBlock] = list(self.free_table.values()) + if len(free_blocks) == 0: + raise ValueError("No usable cache memory left") + + # Find lowest timestamp + lowest_timestamp = free_blocks[0].last_accessed + for block in free_blocks: + if block.last_accessed < lowest_timestamp: + lowest_timestamp = block.last_accessed + + # Find all blocks with the lowest timestamp + least_recent: List[PhysicalTokenBlock] = [] + for block in free_blocks: + if block.last_accessed == lowest_timestamp: + least_recent.append(block) + + # Find highest prefix count per block + highest_num_hashed_tokens = 0 + for block in least_recent: + if block.num_hashed_tokens > highest_num_hashed_tokens: + highest_num_hashed_tokens = block.num_hashed_tokens + + evicted_block: Optional[PhysicalTokenBlock] = None + + # Find the first block with the lowest timestamp + for block in least_recent: + if block.num_hashed_tokens == highest_num_hashed_tokens: + evicted_block = block + break + + assert evicted_block is not None + + del self.free_table[evicted_block.block_hash] + + evicted_block.computed = False + return evicted_block + + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +class RandomEvictor(Evictor): + """Evicts in a first-in-first-out order""" + + def __init__(self): + self.free_table: Dict[int, PhysicalTokenBlock] = {} + + def __contains__(self, block_hash: int) -> bool: + return block_hash in self.free_table + + def evict(self) -> PhysicalTokenBlock: + if len(self.free_table) == 0: + raise ValueError("No usable cache memory left") + evicted_block = next(iter(self.free_table.values())) + evicted_block.computed = False + del self.free_table[evicted_block.block_hash] + return evicted_block + + def add(self, block: PhysicalTokenBlock): + self.free_table[block.block_hash] = block + + def remove(self, block_hash: int) -> PhysicalTokenBlock: + if block_hash not in self.free_table: + raise ValueError( + "Attempting to remove block that's not in the evictor") + block: PhysicalTokenBlock = self.free_table[block_hash] + del self.free_table[block_hash] + return block + + @property + def num_blocks(self) -> int: + return len(self.free_table) + + +def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: + if eviction_policy == EvictionPolicy.LRU: + return LRUEvictor() + elif eviction_policy == EvictionPolicy.FIFO: + return RandomEvictor() + else: + raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 5e7cc3091d775..1ae58f525b0fb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -10,7 +10,6 @@ from vllm.logger import init_logger from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceStatus) -from vllm.prefix import PrefixPool logger = init_logger(__name__) @@ -95,10 +94,8 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=self.cache_config.num_gpu_blocks, num_cpu_blocks=self.cache_config.num_cpu_blocks, - sliding_window=self.cache_config.sliding_window) - - # Create the prefix pool to cache the prefixes. 
- self.prefix_pool = PrefixPool(self.cache_config.block_size) + sliding_window=self.cache_config.sliding_window, + enable_caching=self.cache_config.enable_prefix_caching) # Sequence groups in the WAITING state. self.waiting: Deque[SequenceGroup] = deque() @@ -374,10 +371,12 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: seq_data: Dict[int, SequenceData] = {} block_tables: Dict[int, List[int]] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq_id = seq.seq_id seq_data[seq_id] = seq.data block_tables[seq_id] = self.block_manager.get_block_table(seq) + self.block_manager.access_all_blocks_in_seq(seq, now) seq_group_metadata = SequenceGroupMetadata( request_id=seq_group.request_id, @@ -386,7 +385,8 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]: sampling_params=seq_group.sampling_params, block_tables=block_tables, lora_request=seq_group.lora_request, - prefix=seq_group.prefix, + computed_block_nums=self.block_manager. + get_common_computed_block_ids(seq_group), state=seq_group.state, ) seq_group_metadata_list.append(seq_group_metadata) @@ -496,3 +496,6 @@ def _swap_out( blocks_to_swap_out.update(mapping) for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): seq.status = SequenceStatus.SWAPPED + + def mark_blocks_as_computed(self, seq_group: SequenceGroup): + self.block_manager.mark_blocks_as_computed(seq_group) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c01e7311fb89a..0349c3a6636c7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -25,6 +25,7 @@ class EngineArgs: tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None block_size: int = 16 + enable_prefix_caching: bool = False swap_space: int = 4 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None @@ -173,6 +174,11 @@ def add_cli_args( default=EngineArgs.block_size, choices=[8, 16, 32, 128], help='token block size') + + parser.add_argument('--enable-prefix-caching', + action='store_true', + help='Enables automatic prefix caching') + parser.add_argument('--seed', type=int, default=EngineArgs.seed, @@ -293,7 +299,8 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window()) + model_config.get_sliding_window(), + self.enable_prefix_caching) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index daa6419cdad3b..9e52d20ca4980 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -225,7 +225,6 @@ async def add_request_async( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -245,7 +244,6 @@ async def add_request_async( sampling_params=sampling_params, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async def _run_workers_async( @@ -422,7 +420,6 @@ async def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncStream: if self.log_requests: shortened_prompt = 
prompt @@ -435,7 +432,6 @@ async def add_request( max_log_len] logger.info(f"Received request {request_id}: " f"prompt: {shortened_prompt!r}, " - f"prefix_pos: {prefix_pos}," f"sampling_params: {sampling_params}, " f"prompt_token_ids: {shortened_token_ids}, " f"lora_request: {lora_request}.") @@ -472,8 +468,7 @@ async def add_request( sampling_params=sampling_params, prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) return stream @@ -484,7 +479,6 @@ async def generate( request_id: str, prompt_token_ids: Optional[List[int]] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> AsyncIterator[RequestOutput]: """Generate outputs for a request. @@ -500,11 +494,6 @@ async def generate( prompt_token_ids: The token IDs of the prompt. If None, we use the tokenizer to convert the prompts to token IDs. lora_request: LoRA request to use for generation, if any. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Yields: The output `RequestOutput` objects from the LLMEngine for the @@ -565,7 +554,6 @@ async def generate( prompt_token_ids=prompt_token_ids, arrival_time=arrival_time, lora_request=lora_request, - prefix_pos=prefix_pos, ) async for request_output in stream: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index df4858a696530..e84fda5640e4d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -415,7 +415,6 @@ def add_request( prompt_token_ids: Optional[List[int]] = None, arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: """Add a request to the engine's request pool. @@ -432,11 +431,6 @@ def add_request( use the tokenizer to convert the prompts to token IDs. arrival_time: The arrival time of the request. If None, we use the current monotonic time. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. Details: - Set arrival_time to the current time if it is None. @@ -479,18 +473,13 @@ def add_request( seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request) - # Check whether the input specifies prefix - prefix = self.scheduler.prefix_pool.add_or_get_prefix( - prompt_token_ids[:prefix_pos], lora_request.lora_int_id - if lora_request else 0) if prefix_pos is not None else None - # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects sampling_params = sampling_params.clone() # Create the sequence group. seq_group = SequenceGroup(request_id, [seq], sampling_params, - arrival_time, lora_request, prefix) + arrival_time, lora_request) # Add the sequence group to the scheduler. self.scheduler.add_seq_group(seq_group) @@ -752,6 +741,13 @@ def _process_model_outputs( now = time.time() # Update the scheduled sequence groups with the model outputs. 
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups + + # If prefix caching is enabled, mark all blocks in the sequence groups + # as completed so that future requests don't attempt to recompute them + if self.cache_config.enable_prefix_caching: + for seq_group in scheduled_seq_groups: + self.scheduler.mark_blocks_as_computed(seq_group) + for seq_group, outputs in zip(scheduled_seq_groups, output): self._process_sequence_group_outputs(seq_group, outputs) @@ -768,12 +764,6 @@ def _process_model_outputs( request_output = RequestOutput.from_seq_group(seq_group) request_outputs.append(request_output) - # Update prefix state, now all the uncomputed prefixes are computed. - for seq_group in scheduled_seq_groups: - if (seq_group.prefix is not None and seq_group.prefix.allocated - and not seq_group.prefix.computed): - seq_group.prefix.computed = True - # Log stats. if self.log_stats: self.stat_logger.log(self._get_stats(scheduler_outputs)) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index e7af2c6db5e4c..1eb4ab8b06b64 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -39,15 +39,11 @@ async def generate(request: Request) -> Response: """ request_dict = await request.json() prompt = request_dict.pop("prompt") - prefix_pos = request_dict.pop("prefix_pos", None) stream = request_dict.pop("stream", False) sampling_params = SamplingParams(**request_dict) request_id = random_uuid() - results_generator = engine.generate(prompt, - sampling_params, - request_id, - prefix_pos=prefix_pos) + results_generator = engine.generate(prompt, sampling_params, request_id) # Streaming case async def stream_results() -> AsyncGenerator[bytes, None]: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fc82018d18eb6..62f1d172377f6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -124,7 +124,6 @@ def generate( prompts: Optional[Union[str, List[str]]] = None, sampling_params: Optional[SamplingParams] = None, prompt_token_ids: Optional[List[List[int]]] = None, - prefix_pos: Optional[Union[int, List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, ) -> List[RequestOutput]: @@ -140,11 +139,6 @@ def generate( None, we use the default sampling parameters. prompt_token_ids: A list of token IDs for the prompts. If None, we use the tokenizer to convert the prompts to token IDs. - prefix_pos: If not None, we use the given position as the prefix - position for each prompt. We will cache the prefix's KV - cache and reuse it for the next request with the same prefix. - This is an experimental feature, and may be replaced with - automatic prefix caching in the future. use_tqdm: Whether to use tqdm to display the progress bar. lora_request: LoRA request to use for generation, if any. 
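# --- Editor's sketch (not part of the patch): what dropping prefix_pos means for
# callers. Prefix reuse is now automatic once the engine is built with
# enable_prefix_caching=True; the model name and prompts below are illustrative.
from vllm import LLM, SamplingParams

_llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
_params = SamplingParams(temperature=0.0, max_tokens=16)
_shared = "You are a helpful assistant. Answer briefly. "
_outputs = _llm.generate(
    [_shared + "What is a KV cache?", _shared + "What is paging?"],
    sampling_params=_params)   # the shared prefix blocks are computed once and reused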
@@ -171,14 +165,12 @@ def generate( prompt_token_ids) for i in range(num_requests): prompt = prompts[i] if prompts is not None else None - prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None token_ids = None if prompt_token_ids is None else prompt_token_ids[ i] self._add_request(prompt, sampling_params, token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos_i) + lora_request=lora_request) return self._run_engine(use_tqdm) def _add_request( @@ -187,15 +179,13 @@ def _add_request( sampling_params: SamplingParams, prompt_token_ids: Optional[List[int]], lora_request: Optional[LoRARequest] = None, - prefix_pos: Optional[int] = None, ) -> None: request_id = str(next(self.request_counter)) self.llm_engine.add_request(request_id, prompt, sampling_params, prompt_token_ids, - lora_request=lora_request, - prefix_pos=prefix_pos) + lora_request=lora_request) def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. diff --git a/vllm/prefix.py b/vllm/prefix.py deleted file mode 100644 index 5b6e8e4b92be6..0000000000000 --- a/vllm/prefix.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Dict, List, Sequence, Tuple, Optional - -from vllm.block import BlockTable - - -class Prefix: - """Data and states associated with a prefix of prompt tokens for multiple - sequence groups. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - token_ids: The token ids of the prefix. - block_size: The block size of the executed model. - """ - - def __init__( - self, - token_ids: Sequence[int], - block_size: int, - ) -> None: - self.token_ids = tuple(token_ids) - self.block_size = block_size - self.length = len(token_ids) - self.hash = hash(token_ids) - assert self.length % block_size == 0 - self.block_table: Optional[BlockTable] = None - self.computed = False - - @property - def allocated(self) -> bool: - return self.block_table is not None - - def get_num_blocks(self) -> int: - return self.length // self.block_size - - def get_block_numbers(self) -> List[int]: - return [block.block_number for block in self.block_table] - - def get_length(self) -> int: - return self.length - - def __hash__(self) -> int: - return self.hash - - def set_block_table(self, block_table: BlockTable) -> None: - self.block_table = block_table.copy() - - -class PrefixPool: - """Manages all the prompt prefixes. - - NOTE: This feature is experimental and may be replaced with automatic - prefix caching in the future. - - Args: - block_size: The block size of the executed model. - - Attributes: - prefixes: A list of all the prefixes. - block_size: The block size of the executed model. - """ - - def __init__( - self, - block_size: int, - ) -> None: - # TODO(zhuohan): Add a capacity limit to the prefix pool. - self.prefixes: Dict[int, Prefix] = {} - self.block_size = block_size - - def _truncate_token_ids(self, token_ids: Sequence[int]) -> Tuple[int]: - new_length = len(token_ids) // self.block_size * self.block_size - return tuple(token_ids[:new_length]) - - def add_or_get_prefix(self, token_ids: Sequence[int], - lora_int_id: int) -> Optional[Prefix]: - token_ids = self._truncate_token_ids(token_ids) - if len(token_ids) == 0: - # Prefix is empty. 
- return None - prefix = Prefix(token_ids, self.block_size) - prefix_hash = hash((prefix, lora_int_id)) - if prefix_hash not in self.prefixes: - self.prefixes[prefix_hash] = prefix - return self.prefixes[prefix_hash] diff --git a/vllm/sequence.py b/vllm/sequence.py index 040e9756e15c6..122960035e505 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,7 +5,6 @@ from typing import Dict, List, Optional, Union from vllm.block import LogicalTokenBlock -from vllm.prefix import Prefix from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest @@ -161,6 +160,16 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 + # TODO The current hashing function is O(L^2). We should optimize this in + # the future. + def hash_of_block(self, logical_idx: int) -> int: + # Compute the number of tokens in the sequence + num_tokens = self.num_hashed_tokens_of_block(logical_idx) + return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + + def num_hashed_tokens_of_block(self, logical_idx: int): + return logical_idx * self.block_size + self.block_size + def _append_logical_block(self) -> None: block = LogicalTokenBlock( block_number=len(self.logical_token_blocks), @@ -265,7 +274,6 @@ class SequenceGroup: sampling_params: The sampling parameters used to generate the outputs. arrival_time: The arrival time of the request. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. """ def __init__( @@ -275,7 +283,6 @@ def __init__( sampling_params: SamplingParams, arrival_time: float, lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -286,7 +293,6 @@ def __init__( first_token_time=None, time_in_queue=None) self.lora_request = lora_request - self.prefix: Optional[Prefix] = prefix self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -302,6 +308,10 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. return next(iter(self.seqs_dict.values())).data.prompt_token_ids + @property + def block_size(self) -> int: + return next(iter(self.seqs_dict.values())).block_size + @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 @@ -408,7 +418,6 @@ class SequenceGroupMetadata: numbers) state: Internal state tied to this sequence group. lora_request: LoRA request. - prefix: The prefix of the prompt of the sequence group. 
""" def __init__( @@ -419,7 +428,7 @@ def __init__( sampling_params: SamplingParams, block_tables: Dict[int, List[int]], lora_request: Optional[LoRARequest] = None, - prefix: Optional[Prefix] = None, + computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, ) -> None: self.request_id = request_id @@ -428,7 +437,7 @@ def __init__( self.sampling_params = sampling_params self.block_tables = block_tables self.lora_request = lora_request - self.prefix = prefix + self.computed_block_nums = computed_block_nums self.state = SequenceGroupState() if state is None else state @property diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index efe570778fb43..aff8ebc903623 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -145,33 +145,37 @@ def _prepare_prompt( prompt_tokens = seq_data.get_token_ids() prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) - prefix_len = 0 - prefix = seq_group_metadata.prefix - if prefix is not None and prefix.computed: - prefix_len = prefix.get_length() - prompt_tokens = prompt_tokens[prefix_len:] - prefix_block_tables.append(prefix.get_block_numbers()) + computed_len = 0 + + # NOTE: This only works for oooooooxxx style attention. + computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] + prefix_block_tables.append(computed_block_nums) else: prefix_block_tables.append([]) # actual prompt lens - context_lens.append(prefix_len) - subquery_lens.append(prompt_len - prefix_len) + context_lens.append(computed_len) + subquery_lens.append(prompt_len - computed_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append( - list(range(prefix_len, prefix_len + len(prompt_tokens)))) + list(range(computed_len, computed_len + len(prompt_tokens)))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping.append([lora_id] * (prompt_len - prefix_len)) + lora_index_mapping.append([lora_id] * (prompt_len - computed_len)) lora_prompt_mapping.extend( [lora_id] * - (prompt_len - prefix_len + (prompt_len - computed_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.block_tables is None: @@ -190,11 +194,11 @@ def _prepare_prompt( # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
start_idx = 0 if self.sliding_window is not None: - assert prefix_len == 0, ( + assert computed_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") start_idx = max(0, prompt_len - self.sliding_window) - for i in range(prefix_len, prompt_len): + for i in range(computed_len, prompt_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue From d65fac2738f0287a41955b45df76a2d5a919bff6 Mon Sep 17 00:00:00 2001 From: Jason Cox Date: Sun, 3 Mar 2024 00:00:29 -0500 Subject: [PATCH 045/113] Add vLLM version info to logs and openai API server (#3161) --- vllm/engine/llm_engine.py | 3 ++- vllm/entrypoints/openai/api_server.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e84fda5640e4d..c9bd89a1b18f4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,6 +7,7 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) @@ -85,7 +86,7 @@ def __init__( log_stats: bool, ) -> None: logger.info( - "Initializing an LLM engine with config: " + f"Initializing an LLM engine (v{vllm.__version__}) with config: " f"model={model_config.model!r}, " f"tokenizer={model_config.tokenizer!r}, " f"tokenizer_mode={model_config.tokenizer_mode}, " diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3777e0f3a0601..993a834e5a720 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -15,6 +15,7 @@ from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, StreamingResponse, Response +import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse @@ -168,6 +169,12 @@ async def show_available_models(): return JSONResponse(content=models.model_dump()) +@app.get("/version") +async def show_version(): + ver = {"version": vllm.__version__} + return JSONResponse(content=ver) + + @app.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): @@ -231,6 +238,7 @@ async def authentication(request: Request, call_next): f"Invalid middleware {middleware}. Must be a function or a class." 
) + logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") if args.served_model_name is not None: From 996d095c541e1cd67f0a7ec2579bc3bb0a435494 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 3 Mar 2024 14:37:18 -0800 Subject: [PATCH 046/113] [FIX] Fix styles in automatic prefix caching & add a automatic prefix caching benchmark (#3158) --- benchmarks/benchmark_prefix_caching.py | 59 ++++++++++++++++++++++++++ benchmarks/benchmark_throughput.py | 5 ++- vllm/core/block_manager.py | 15 ++----- vllm/sequence.py | 8 +--- 4 files changed, 69 insertions(+), 18 deletions(-) create mode 100644 benchmarks/benchmark_prefix_caching.py diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py new file mode 100644 index 0000000000000..c43bd9c3bed3e --- /dev/null +++ b/benchmarks/benchmark_prefix_caching.py @@ -0,0 +1,59 @@ +import argparse +import time + +from vllm import LLM +from vllm import SamplingParams + +PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" + + +def test_prefix(llm=None, sampling_params=None, prompts=None, prefix_len=None): + start_time = time.time() + # whether use Prefix + if prefix_len != None: + # start inference + llm.generate(prompts, + sampling_params=sampling_params, + prefix_pos=prefix_len) + else: + llm.generate(prompts, sampling_params=sampling_params) + + end_time = time.time() + print(f"cost time {end_time - start_time}") + + +def main(args): + llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat", + tokenizer_mode='auto', + trust_remote_code=True, + enforce_eager=True, + enable_prefix_caching=args.enable_prefix_caching) + + num_prompts = 100 + prompts = [PROMPT] * num_prompts + sampling_params = SamplingParams(temperature=0, max_tokens=100) + + print("------warm up------") + test_prefix( + llm=llm, + prompts=prompts[:1], + sampling_params=sampling_params, + ) + + print("------start generating------") + test_prefix( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Benchmark the performance with or without automatic ' + 'prefix caching.') + parser.add_argument('--enable-prefix-caching', + action='store_true', + help='enable prefix caching') + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 51c1a6540a451..1f0bfe06a67cb 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -303,7 +303,10 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') - parser.add_argument("--enable_prefix_caching", action='store_true') + parser.add_argument( + "--enable-prefix-caching", + action='store_true', + help="enable automatic prefix caching for vLLM backend.") args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 08d519ab767a9..daf83827a7e52 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -236,13 +236,6 @@ def _is_last_block_full( token_ids_len = len(seq.data.get_token_ids()) return token_ids_len > 0 and token_ids_len % seq.block_size == 0 - def _is_last_block( - self, - seq: Sequence, - index: int, - ) -> bool: - return index == len(seq.logical_token_blocks) - 1 - def _maybe_promote_last_block( self, seq: Sequence, @@ -436,7 +429,7 @@ def access_all_blocks_in_seq( def compute_last_full_block_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return - max_full_block = seq.get_len() // seq.block_size - 1 + max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return @@ -451,9 +444,9 @@ def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: return [b.block_number for b in block_table[:block_idx + 1]] return [] - # Can return non-empty result only with prefix caching enabled. def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: + # Can return non-empty result only with prefix caching enabled. if not self.enable_caching: return [] @@ -463,9 +456,9 @@ def get_common_computed_block_ids(self, ] return commonprefix([ids for ids in ids_list if ids != []]) - # We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. 
def mark_blocks_as_computed(self, seq_group: SequenceGroup): + # NOTE: We only mark the last full block because with prefix caching, + # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): self.compute_last_full_block_in_seq(seq) diff --git a/vllm/sequence.py b/vllm/sequence.py index 122960035e505..04a9a90a68bcc 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -160,10 +160,10 @@ def __init__( def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 - # TODO The current hashing function is O(L^2). We should optimize this in - # the future. def hash_of_block(self, logical_idx: int) -> int: # Compute the number of tokens in the sequence + # TODO: The current hashing function is O(L^2). We should optimize + # this in the future. num_tokens = self.num_hashed_tokens_of_block(logical_idx) return hash(tuple(self.data.get_token_ids()[0:num_tokens])) @@ -308,10 +308,6 @@ def prompt_token_ids(self) -> List[int]: # We use the prompt of an arbitrary sequence. return next(iter(self.seqs_dict.values())).data.prompt_token_ids - @property - def block_size(self) -> int: - return next(iter(self.seqs_dict.values())).block_size - @property def lora_int_id(self) -> int: return self.lora_request.lora_int_id if self.lora_request else 0 From 17c3103c562e748686a3fa4bd9b43ebe98aae3d9 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sun, 3 Mar 2024 16:19:13 -0800 Subject: [PATCH 047/113] Make it easy to profile workers with nsight (#3162) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/benchmark_latency.py | 6 ++++++ vllm/config.py | 7 +++++++ vllm/engine/arg_utils.py | 8 +++++++- vllm/engine/llm_engine.py | 15 ++++++++++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 6e3b679cb81b2..2fdc08c5c26df 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -26,6 +26,7 @@ def main(args: argparse.Namespace): enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, ) sampling_params = SamplingParams( @@ -145,5 +146,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument( + "--ray-workers-use-nsight", + action='store_true', + help="If specified, use nsight to profile ray workers", + ) args = parser.parse_args() main(args) diff --git a/vllm/config.py b/vllm/config.py index 876a439cd1280..e39fd7265689f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -382,6 +382,8 @@ class ParallelConfig: parallel and large models. disable_custom_all_reduce: Disable the custom all-reduce kernel and fall back to NCCL. + ray_workers_use_nsight: Whether to profile Ray workers with nsight, see + https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. 
""" def __init__( @@ -391,6 +393,7 @@ def __init__( worker_use_ray: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, + ray_workers_use_nsight: bool = False, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): @@ -404,6 +407,7 @@ def __init__( self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce + self.ray_workers_use_nsight = ray_workers_use_nsight self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. @@ -426,6 +430,9 @@ def _verify_args(self) -> None: logger.info( "Disabled the custom all-reduce kernel because it is not " "supported with pipeline parallelism.") + if self.ray_workers_use_nsight and not self.worker_use_ray: + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") # FIXME(woosuk): Fix the stability issues and re-enable the custom # all-reduce kernel. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0349c3a6636c7..6882e8be34d11 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -46,6 +46,7 @@ class EngineArgs: lora_dtype = 'auto' max_cpu_loras: Optional[int] = None device: str = 'auto' + ray_workers_use_nsight: bool = False def __post_init__(self): if self.tokenizer is None: @@ -168,6 +169,10 @@ def add_cli_args( help='load model sequentially in multiple batches, ' 'to avoid RAM OOM when using tensor ' 'parallel and large models') + parser.add_argument( + '--ray-workers-use-nsight', + action='store_true', + help='If specified, use nsight to profile ray workers') # KV cache arguments parser.add_argument('--block-size', type=int, @@ -305,7 +310,8 @@ def create_engine_configs( self.tensor_parallel_size, self.worker_use_ray, self.max_parallel_loading_workers, - self.disable_custom_all_reduce) + self.disable_custom_all_reduce, + self.ray_workers_use_nsight) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index c9bd89a1b18f4..8a2573034c940 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -124,7 +124,20 @@ def __init__( ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") if ray_usage != "1": os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - self._init_workers_ray(placement_group) + # Pass additional arguments to initialize the worker + additional_ray_args = {} + if self.parallel_config.ray_workers_use_nsight: + logger.info("Configuring Ray workers to use nsight.") + additional_ray_args = { + "runtime_env": { + "nsight": { + "t": "cuda,cudnn,cublas", + "o": "'worker_process_%p'", + "cuda-graph-trace": "node", + } + } + } + self._init_workers_ray(placement_group, **additional_ray_args) else: self._init_workers() From d0fae881143f07a558ea72b2cae3c4c6dfa94937 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Sun, 3 Mar 2024 17:03:51 -0800 Subject: [PATCH 048/113] [DOC] add setup document to support neuron backend (#2777) --- .../getting_started/neuron-installation.rst | 135 ++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 136 insertions(+) create mode 100644 docs/source/getting_started/neuron-installation.rst diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst new file mode 100644 index 0000000000000..0aff1037d8a29 --- /dev/null +++ 
b/docs/source/getting_started/neuron-installation.rst @@ -0,0 +1,135 @@ +.. _installation_neuron: + +Installation with Neuron +======================== + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK. +At the moment Paged Attention is not supported in Neuron SDK, but naive continuous batching is supported in transformers-neuronx. +Data types currently supported in Neuron SDK are FP16 and BF16. + +Requirements +------------ + +* OS: Linux +* Python: 3.8 -- 3.11 +* Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +* Pytorch 2.0.1/2.1.1 +* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- :ref:`Build from source ` + + - :ref:`Step 0. Launch Trn1/Inf2 instances ` + - :ref:`Step 1. Install drivers and tools ` + - :ref:`Step 2. Install transformers-neuronx and its dependencies ` + - :ref:`Step 3. Install vLLM from source ` + +.. _build_from_source_neuron: + +Build from source +----------------- + +Following instructions are applicable to Neuron SDK 2.16 and beyond. + +.. _launch_instances: + +Step 0. Launch Trn1/Inf2 instances +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. + +- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ +- Select Ubuntu Server 22.04 TLS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. +- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance + +.. _install_drivers: + +Step 1. Install drivers and tools +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +.. code-block:: console + + # Configure Linux for Neuron repository updates + . /etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. +Follow the steps below to install transformer-neuronx package and its dependencies. + +.. code-block:: console + + # Install Python venv + sudo apt-get install -y python3.10-venv g++ + + # Create Python venv + python3.10 -m venv aws_neuron_venv_pytorch + + # Activate Python venv + source aws_neuron_venv_pytorch/bin/activate + + # Install Jupyter notebook kernel + pip install ipykernel + python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" + pip install jupyter notebook + pip install environment_kernels + + # Set pip repository pointing to the Neuron repository + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + + # Install wget, awscli + python -m pip install wget + python -m pip install awscli + + # Update Neuron Compiler and Framework + python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx + +.. _install_vllm: + +Step 3. 
Install vLLM from source +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: + +.. code-block:: console + + $ cd vllm + $ pip install -U -r requirements-neuron.txt + $ pip install . + +If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/index.rst b/docs/source/index.rst index bdc541cb2d58e..e90481845c4ff 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -62,6 +62,7 @@ Documentation getting_started/installation getting_started/amd-installation + getting_started/neuron-installation getting_started/quickstart .. toctree:: From 901cf4c52bf65472ca13aa4f996d631d00c2228d Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Mon, 4 Mar 2024 14:48:27 +0800 Subject: [PATCH 049/113] [Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171) --- benchmarks/benchmark_prefix_caching.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index c43bd9c3bed3e..a0307439cd5f1 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -7,16 +7,10 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. 
Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" -def test_prefix(llm=None, sampling_params=None, prompts=None, prefix_len=None): +def test_prefix(llm=None, sampling_params=None, prompts=None): start_time = time.time() - # whether use Prefix - if prefix_len != None: - # start inference - llm.generate(prompts, - sampling_params=sampling_params, - prefix_pos=prefix_len) - else: - llm.generate(prompts, sampling_params=sampling_params) + + llm.generate(prompts, sampling_params=sampling_params) end_time = time.time() print(f"cost time {end_time - start_time}") From 27a7b070db526326ede3335fb07c1fa13ac008bb Mon Sep 17 00:00:00 2001 From: Jialun Lyu <43287111+pian13131@users.noreply.github.com> Date: Mon, 4 Mar 2024 09:23:34 -0800 Subject: [PATCH 050/113] Add document for vllm paged attention kernel. (#2978) --- docs/source/assets/kernel/k_vecs.png | Bin 0 -> 27676 bytes docs/source/assets/kernel/key.png | Bin 0 -> 111314 bytes docs/source/assets/kernel/logits_vec.png | Bin 0 -> 17475 bytes docs/source/assets/kernel/q_vecs.png | Bin 0 -> 42065 bytes docs/source/assets/kernel/query.png | Bin 0 -> 32710 bytes docs/source/assets/kernel/v_vec.png | Bin 0 -> 51256 bytes docs/source/assets/kernel/value.png | Bin 0 -> 121414 bytes docs/source/dev/kernel/paged_attention.rst | 525 +++++++++++++++++++++ docs/source/index.rst | 1 + 9 files changed, 526 insertions(+) create mode 100644 docs/source/assets/kernel/k_vecs.png create mode 100644 docs/source/assets/kernel/key.png create mode 100644 docs/source/assets/kernel/logits_vec.png create mode 100644 docs/source/assets/kernel/q_vecs.png create mode 100644 docs/source/assets/kernel/query.png create mode 100644 docs/source/assets/kernel/v_vec.png create mode 100644 docs/source/assets/kernel/value.png create mode 100644 docs/source/dev/kernel/paged_attention.rst diff --git a/docs/source/assets/kernel/k_vecs.png b/docs/source/assets/kernel/k_vecs.png new file mode 100644 index 0000000000000000000000000000000000000000..4b7be1385aa2e012b3733835394175af97f073fd GIT binary patch literal 27676 zcmeFYbySpX*9Qs+NGPC$(%q$`G}7H&BMlNmH$y3cBHi5`(hbtxC5<#gN$0@1QJ?qy zzVG?_tabi4v)0Vqb=BVc+VR`_4pvf-e2PYZ1_uZCR9Z?*1r83;1UNoNK?Z)qW0X_j z;Lyq}MMagQMMcS!9KdFl)~0Z9Qo%8CkJVJQv3(CcS(~xQ$j4EerNQvoA4U=Aq#hHJ zqhiUR_@x_Z;%R=%r10HolL3ep!qO3I2<)5#0_n64P@&4-m{`O>e z$P3KoYIY!Sk;;w<#}L38liSe`XCmcb@)KUL_(z=^9t7^YFACjvj2|aRvnmQ&S}4M% zD);(Ju8%C_ma7Q9oE_fZYbrncT!929Y)D6oR*8g9M=IIF=%*I@2(ALNnvi8MBJ@Z+ z8lSC7IGQxn{umMStNrR5<5z2f5kK{!r{KwyF1@tb;BY7>hx>7mfvE%6RKVFk(Frh zV*7;3OGjVv-A;{tOrS9-7QgtN^lI^_G#+{ntcOROAexv&c2`G3KewotVWTh2|qvUNwcw-M_yGFG#yvbeH0MyRP7v*4Gu|I*Rww@mL694 zf|7dtlVOj|y)F|HLG|67p{&(tAAfeaL~~Gea7a4%iCS@rkb0|Bd&1`g;-Y+cy5qB? 
z<6xCbSw}CN`s^J;G#_OT-}clL!i@N&0FvH z!q|iFea1|{m*ty8ug`={o45Y$%{DW3x!ZHUGpE?*l4eRcV^^0|sV39VW$_?)_x{+O zKoZl_1G4!%>)=Ol9E*>H1NcqZMMi{d;NiX!B06Xve6ZQ$g=ZpzCwK?bO&v}PS2PgiH{Rc+BFwzfn4F8T5Itv8aHf=k2 zJ@}b6n_qCJzK~xCPy8<4;rNsBi+$V6^2E<}fj_GvX8kq$pn*ujFn z3Yo$Ga|T=f^TI%gpMtP?hOE*HCrs!^V zb|lG|;19At87{FFP|`k@eXspVa*4zr0{0>0)idu-G($=~Dx6@+NG$vcB6=~IaE5FQ z%eTDYNm7`{&#!{o47Xq6b)&6?qIQZIBE~ZfIt+Tq;ct@VKAQ-w58ml|V%TC-pk-4` zJ^gf`quVH|9K9Ur$MZdi^`mqCbFLcF8#rL&|Fx*6dbPpM zyUviGq|C3j({n}i$jb|(1+Ep_2gL{3=h1T5y)YT}Datvbj|l%;5iNN-@<}WOY+J0< zXA;5vV#b+-ucPUxSZHi14k&A|%AVZ>XHnrs()R`6NXt-{P{c$+rM9ForNMnB74Wk$ z{7FnSWmIvIVd1Qio_(bBg*fPFacprEdDhuP=~XHR%1bH*YS@L`oT-VTi&7=Isab-# z5T#5;ZN{VM4>F9={V@bFwsg&z&n&w+Fl+>w!!$(rBpM|=3*^+els6epV&WL%7}Jzm zviOTdG|N>{)%%pR%T%%)q)ml!%id=~^BB~IG#86J6sp5QN1yKKHTbp!wuC`amqxz7 zmc!?lV8{xxk00xMsiLaxsP;ueH(R6BMW*Ca@GJRveluQ2>=U}mq$|tBVUA&ru`1^Y zXzpS>S7vF>K?#crt6G_6YssB-K+ey+h}?4VEa|lf8i@%J$C^!%A%Vm#PSE?;RSVUK zIv;f;s}HO?=J4iz&dF9?ya!E@Opi^Wjz&QDCVBEsRN@MDOOgs3U1Fbu#I&1=M0!1Y} zB-p>k5=;>+a?=o0epL-|ed+o#EIvN5GrpM9k`4crI`>AxX2Nu$AV*fBdxB|#dcq=G zPeSF;#Gq>usMeP`nMsAIpSe`KJ$5+OXkal`f1-RsKTkGSQ@+($ynIz_LuX3oOozQh zv{+M@Tf4R5bwzOPtkHzw$<%Zo*P=_>{v$s1MzY2!7iJgm9`hd3`NnxIQ9i~U1|v}- zu>qeB?`NWmH%rc$d_zt;yOW!zU9(-=(Ux>7rV zo$IloS>~So{CUuU#)0Jl=fNX(IrAX1y`-@%!K>Tx}XL@+IRGk!Jc;jr?srVd@yMv!9 zL(HYkI6wL;8Ej9FN*d-GY8$o=cMoAC=?mUi*6Cl4{|ZI%==2Xt47!LAj7Xv&6-yBB z%bLry9COSn%91qlGX8ArVPs8)_DmsgME*tnH2 ztjlpY?lp9^bE3$c&b-35ckSoVz zOV6=8v%fHPk!H?88c$`aE#B7_t#~(kXHDm6;Npn!$R@P3e`sB(*V9qJallcCDa63d zI-;MVFSF{(o?&RkUEO|7{B?0wkYVi6Is=9Y?YL6hNpPG{=h2Z z+b@R*8)fOStuz0#IPM=dbc?@ijK{l&<%g5bt-Ebvtq+z~t2f@2%h(&(FHa}v64sX2 z;M#iDdbgd+QioFG@=5Wn^LfaK$|SiG>@PH(IBLtx?mMI%U+d2`PVW=UGB0==hh3yL zprw4y{H!Px?9-r&uQaP=s-j-;qL$nBaKd}+h8JRXuh$H6^clNjeN6S(10#^sy3D1Z zv~;X=RtRwvXLe$c^($7?RwvS@f^#k*-o*!XM@RZk zPXpT}^6l-{lvHI@6&>{)XM)qQZI}<3pSJ4Ud#q&N^&P9e&*Ms=Nf{P`?aM7C{M4}A z9jNRxznR0U5_t2%^FZTf?|k^AZHv|5SF2rVz=Zu~W=~gC+yYBV-C)atQ`eQuntEQ* z(^ekX^ytpQtlQ9r&WZEPF0(#+gTnH!2K)2e!y)uqx7yN$ROiMbzQUV>lzLZ>1GGbo z1=-e%Ysdj?>@g#$5h&AZSaI|@e^lZQRnrgK#K=AGI7om##%Z0T2 z=l~V zgqG0Awtb@_;oKbI;NBC#o$kVgZ_1rct|7>KJr2RXROBCX4g10l=cx-Hq=zbuQs|mF zhe*r;hU=colmf%?y`k$ix#wFd3JYerE*bTBwojCPp(*b1VT*2lwdrbp$xLPnK|q zf1gnR{vST!!1tlepZ`aZfpCw3v**CqEgj*Xrx8ukAN_L-cMjZxd#frcEe-su8atSp z+B#Z*oz4f+{DBLoc2e2^1@NdJzVOm2RKI}!XDrpUoV4WSc#Xj}EQTgvBU2VP8@q>o z;P~BmfkPWpCqptf8*5ufUN-@X-%s!Y#}AiTDad|5;$$U2p(U?GCJJ^iCF5dw#qx?m z5RHtCjNie;j8{cW;%{@{On}0|$;pnFmDSbNmBp2v1?*tX`kIG_hxHX3D;pa#@C37? 
zyRDO<8?&t=<)2Re=|{}e(b&P#&dCyNOZL#Op%K{GNq~alVW9tf{>;QVE|JR-Wj`%-XYW=t6 z>(^`?|Fh};xb=UVsymuGh=Of^F`Wee%dfwU|L4uW4f$Cgmi|9T@rTa8uL6V?MB``u z57PwEXk)qQ06r2~iYclA|A3VJ{&)hv57!_5fiDAlx0hTr931_$AdU(vGoUIXNO2(1e9q+%e*SDsEXvmx5jP7S zo*bb~2@T~l8bUvL?Lv^R5#oBFw2$ZzZzW%4Ptzig%e^X6qvP4pF673nb!I}fZS;%L zTR3<`EaCtCgCduX0H5%^xDVyu_g*2P^o~CJzb%B}{Ct5Mh))z;P{@4Y5dQZMJpT3n zy#zdg^$iX_!MCjso{(p}dpM_T@3seh@ zx;~35VHf_HtFR>sRyh4vm)zLj9C(fOqrtgIr`e=e6wF$R5{E$5P`|~m)Tgv05Qib(PjJ}7^c*Y5Jjt~vDn~zf^&H_;!|qi z_7iCYwii_My)EFedX~i3$xvcGesl)EI`$F%3CGrILE!g}^3zBPg+V>zl~QqP}R{hQ>hbM+7dincZ@I z2gkA`)wQ(~Jnmr^6S-?OjsU@AY(0(t?rXnofH%Pkx=PO<#_@ZJ+KX5S5ZocU4w&;M z5qaoW?a~keP8n(89xKBSA#rm2HHY)Hic|T@ls1O!NG1|tBqaweHwRUgZu?Ep!lFrk zB=m~ZtZqi)8p^-ize7anT_1^z{!K3#DPbCm5{;6?tnL>euk!2lC`F=q2knRm%#*DV z&4E~E)hHTyhh!Py?zU$GxCw)+8$c zB8wioCD6Uf5e-I-;#ZG6Nxgo(?YC!x|F)ph^PA1s zEV{;oSbSqeG}$9K@;P+*qSN3z7)`IL25?-J;chbtT)11-x>ODu%@Exzk?(t(a2&GY zVDcU9@3veQ2Q-1fOKHT1Eu20HEHYZ>%T{emJ+cWPora;w6>+OI^R}w2t+y54s;kZX z&l_##Yl5A3zlNmO-yRQ?s2e={qr=F=00|;rjpF<*LHtO7*cX`Kh#XUX-iW1kKUQhT zaoFhJaY?!v{~FF?x8keQ;8YN)tGRxj{~4FDFjqb)5mZ6QVUb=Ty<7sPLfi35#YOu1 zc%;SadM8gt0>q?UA*NoS(lw|nak_r}fxb^Mm4FI+7;<~m^L}@_1X^4=~j^g=IOC0P#ptiLy5UD6nn=*_4Xo0g1` z^`Di)`K=(Gc449aC8-J4Xw)SE?B;^oY4mL--p2Jzxv``%?D{N~2TIGGlU)Ypetl|O z{gr+A4W(&XGrxX=(`%>nZg!g+4A>;eT502fOj0h1X~Y}T&B25~j61i@`%Y}dmy)4G z(nN=65|3j-K9P9P*|cb+oZ9eqdmc7##YFGA(08o&MSV>b@Fd`s+SE2+#B`7GXa-bM zAckChZh~bNhF5lV1b_BEGA4=jwr}`Yo&8$0!;WsOUZwpmCNZy*oxpU9J#?~w-o46d zW*CBU_+PeOS-t%4+Tz*G75d|JZ(dyME${ZJ3phI6Af#GdSDKHU%Pr3x0rWO6jv~1- zvYxLQ3THi@jsDrsSjN$EcWuRp-cy8FO{0*!&6qz`ph~+|IU?woy;V)|Hru)%>LRDD zt*!Bvnh}i3veTn|GdyB|T;@A?d9GD;~xx!=s3x#`KR$d1=LBw z`Tl%Tt%xZvR}#!1>RRx)ml6u7?I_A%^E7V_*l}E|leMgYebSaxXap@6wNQn?mEQDU zVy6EBEHxTf4&(PC#L37UbM@@IaNUc7TT&!;o-vf}Y%r2#Nyg;++}|vhy7zY9-(9qt z3AdfVrSUD$B<)Y+g+-EyxWDi&3LURiaHpYktm|!J-q1HWh~cuCbEBbb3yo#gEu1RS z^dyPe#VGTYD>G=G1XxYf*L>7X2n{5(wX=5AjaZoi3-eQU&&^$K`A7qDefDp7OQR(aNy;P7By>Ti@;QVTTiV~ha}YU1?O z3;%vB43a%8cw_A-vXGbR(Vr^6+AMd+7&YC%e2qtOb~>S$qzH{@kP;SdDK@Xv^VrEL z)UCCZVTueu!lsc^T5NPxq+&PfL~%ZBzMz?oTJEK6-k=xRN_GsrJ|19=Nd@)DO%yS) zHlIc|z|QBRqNIxlwv&5EJekT_z}Zbgw?8q}E6s+Yt0E?0UszjS1*ou_4cZYgGTLT& z?LF>Wuj}}PSqqkFnmOr+56g@V_RPT`ghUDp^*q%4E%&D&Bux=-%Kb#-qI1ripvzy z4#hLbuueeE7RPUUgzgN9gdqG8A?xH(t5D%j8Q@)q!M4mayY&P!Kq-CozB#8JFK>+q zc6Sc%JJU-b;-2tgyOx&huCknX0&}8XTWJe8&sJT!-(dBT?vvasKGJb|MYZ8JAl+XK zMwQ`Fe_Z;+l5h7-ttN4}G>;-zZ|6W?k1T$Lwi6k{h8}Tv-==|3@)ev}48NV+qqvkkDnU0O1tED%eg!u9s_}Ggztg_`- zJG#KdKu`!^^~Su^sdrjIr;5D2at)AOV-{{)bi|@P( zt4?&&Z*nUiSL~1}Zfut3t_Xa_W@}{q$2&}S1N?9t@VpB9Z%IW!LnIC#3+vaI$Wv6* z%@Vr58iCff!b;X&3S5gmefjEp`H&U0zOV!WSx4Yg)fh(2bKXVg#)au@(zhuUiZs}z zN*N;kF11hD?&Xd%)9$jD+_zHJheDkNt|fwl2YD)uy3l%Bn_+rR%l!PdUw zc!JVraK3V3GnJgonEvSVAG!G7UZOi53NIPuNpTZEI7_S74(n1mq-}d|Xh`7` zuvL{L^SiTc!G-RhhJdNPYsW;`R>dXIq{FEunwSj|+#WX85HOjoH}adO5i5!{Z4BT)oNSeaP2%rA=~P$G)z7T|LcwY?$w z|)QM#?BaUs2t6QKTs>AwdsYebH){eX|R? z*MXgt`8Zn-RIR!+#sc>?_amOEoQo@-7M-s5pUcx90$%ic!NmENtm0z1p(O5+mf(wj zY(gZk((si&yoZ&NEdjc$!b+&NhsE60Y%?qJ5gXfQF^MQQX12s! 
z6_6YU2)zWkv2ot7p5d+Nb^Z{-Z`%mCzM&i@U5sRu@X<9k^Sq&PJ#GX191nh_b!{2{ z$sjvav-LX0I|@2qTSmb7KH8Y@KmK|sFcG_jKHT3E+5A0`?`g0-b7UQ%gFPOrv;##FKbByJaImR02v-dwhDh3|iuirIwA2S0r?> zX6oq&yy8jc8U46Oe||%0iEnPqfrjy6-&)S5b%{R#6kI1mU6R7;Y3a>sUPMqlH(qb% z?<+Ly2&Bi}?-wld4cT){`k^UtTdd!x0eEua81_uh-6C`gB46O+uXFjxAKMrI<7~qr zT#I}_`XjH14mfW-PV+N-8%s=1SVZG}HSVOe6&7<`Eri9-~ zxZ6Ht{xhUPso(vYUe@+ggYA61qj~=oFWH)hbq%3rsa`^4t~xF8_gA5$LaDFmy}yKI z&`xlkszyD5;m{lRhLd-14kZz94aL9i;3O4j=(JC+rnrLk*b~YpaXAYp&UIPs%vR>@ z@&N7}`}x7*u5%*!J~SpRe%n`W&XJk>rU=6&=jCv!zWU3F60jW=^(?U<|f+N6!!v0i6=9^7d4d z)#f|4G7|+n_og$Rtz=`{89yXWyA|OxK%i*%z357GYc(8_Mr&KnN;=&G9%E{-`HM-& z_flj*7W!!^eIj2-OWsvmPDlvc{_5D=f8E!kwfmq7F$GS;tD$Z&Ri0;1()mXkN&SYU*k!g-T|EP=u`*k`WU58A#TW&(cdyOt zH^TrRit?LhsrieK2%`XEEh0=S``Z$%V%ar*?Nl7%b=k8(wl8hHyDnCUyON}lPc$L9 znN(FwILR_`T>bU4V+wY$3?{iAmt`#=5%4f^)af>6TP$@vUYBO?UBQ#Ii~tMtN9Y1maC|%D~DDKd`^d)q=|a9cF48;}+x48SG! zG{8E=>=w}0{nz}vUS%^Z`VCa$@)r`DQe%`($B|_-3vWD3qz%RuON%fpveo1tXFMUE zWIMHyaUXHq9?hV?>O<~3X?sjM$SXwL#D2O>GE?;<=r0)c6D{AalGgQ)Iay`>Wg`3-wtY_Grbt!CMR(jw#Q)>um?(^@`x`HIX& zNq9uw2vA3YZ6&F!-QlQ)1K4umjsyMuPXuUXGx=wm#vIpQdMsg&c<7wTLt+|suNci% zIVYB$Z)}n|4oW5Un$)nB+vdo`3zb=9O z)ee3n^KZzGnnqYSx04_lO7Va13IG3%{^CaeKOz1W7{K=%UVrS%bGqqeR|vGr zxo@Y1I9gv268Z4-R|j3$tJ_J&e&=J=v^P~z(XW4Xp;+Q`ScGRV>X!NdfbT&4DdafD z(Lq1k8Ul^85_D*~GaTY!6>>f{rZqjOyevPNAia&2A3idU(4!UFIEyMvj@lKu7l6&G z6~nk^?!F)WwkBhEzZ|{4I|q9vIp}*|6ZDQa7dBdKmYjkv4m%SzRKn8lk(Yf`b;)a) z8EQLDT&tE@z0N@-5J9@z+wUhI&z%}?${D9wt4=`>`|KNzDjx;Yo28L0?FP1Qm=Eq& zx)Wj@%g+WXeelS+PA5b!e1`fIbPZ0v0XI~bSF#?$CjjoC2~o*dgR@nZW|;wgSfvkL zUGv4s+n==g!or(Qn#aa3T!c3P1KSONYa#Tixi24FB>;|7KY&QyJKyJ!3uQs@D6aAZ z76nJp#(Wk9WoI`R^UB)-1%*k2JV;j>Ivc|r^c{4*E0SVB8$R60-VdER5`4T%DhSz_ zJv<~-OC*{=GU0VSux8S$du3mf#%gqqY=4t3MRjo0t*GSZJ7_5<3YX9lL7eq?=`Sp%~D z{D^a(<&1T!xxN1W^q1uM&hZiR&+IXK)7z~dcUot6Zul4uqpks?r&et}t8&4U<7C(E z1^8*1;~-!4uLPWf&bzGu6wkP5Sv-{tf~D@7Kc^6Skg+zF8r}_+G}SzX0>GJnz#0=^ zrLKodGI#aNryy?J3!C~XzZn0tG;AL3{HDx!MHbkM!{+ZWBK-}(n!Xm|fNNqFYBYv2 zy3MEoXH;0+T~hP+9ZHy&X79!pmHvJ8Dgp0gq*ZyPU`slDf(Rb}E`b9X0tv!%EFv}& zDWfuyw;3jc=m_>$>G$0kF?5uFK+7ER@6OVeRgUG;<0bbldp};k9?cH7e~Q4?HCL3c zsRJKeRbx3pv++_Odvpf$M&-HDBLpOz^m{z4xa@y$LR$DzS%=ltl^|5Nv*p*u-T8vv z*YP>jWZ2~rS#P??!--I1bozdAq4s|x!*Yhd22bh+9shCRlz0*J;-VFFUl_|TUW32j zF-ObwP{=YpV!Ni7dB69gyz7U-WI+(3Npy&q_&$0q%PglGsVUGywC<;hzdh(Y^q}GS zLf2;Tx=g)C3)zIrb|D`Kp}XWT728Bv4~~LFM!*KCYeh{(GUB#mkH>|j50JRSNPaDI686{ zYEgrmBZCXfrTl5{D{H}OzuD*V@U;omJFR2Zj509{;R`;fz1ali55ZPYlCXntATa;x zcRR7%R}ErR+pRhCq^lZ^wRePHd_(488XuY576@XGC3MrxM^fM4ULJ*tjQbE16^m(+ zZG(uO<)&KzT1r?M#ZF%zXGp6eW~Cw9B&mgtfyC*)B)!#%;3^!wVdaXB-IDAbYHyJL z)ybA_(WX@@goDVk!e5vi@$#1W1B2y*JMk0sS3OCp08bpq_f8igs)C|OwtXinOw&|D zvxS2CN`yw_>sw_9yztiSexVzRRynQTaKkD+*69m>HwI;BKAP4HH4uHMj0oNoYs=k@ z;!jOy>h&RL+zUW%>j-$AEZS~Z;S<*J-ZfJp)%~LHGs_$QiJup!j}0%;hvU7fu;#rJWY7L>T`M`XMX~h*JhBP5=M^ z7+G|Umpp<`7`O3!PvQ^vLXbQ?iu!MopbpJWf3?58J02BCTxka=>wSWQ$AA2o5d}xX z7d@^0I+FLFYyvB=GtV`M@bw>3fHs6OvDyH>0`ORv42+<6x&Afs!{3q9lOHw#cK8G( zIe@u59}*__LcI@ z`_%{6Y5T|f#zK(;Xz@M2FW>h8I|z~;M|u$i;Amp&35!5{qVBvq&FRH3Tm-gc zbZyt0RyxFDQho$B@v&rTO;n&qMg`&S(r2v~Hy4MRB|6nAyEA1MOt;XfBKN|KJd8i7 z-1g@y4OC3-$3vflM`3Zka)LTvIgD7}!B(Dy2Yai=njjV5S(Wj_<2Op&E?q#UWWN$} zv)HtmBFb;{$4roPDEwiaKOWG_Z^8K6i*8tz5#yg;Y_MaU+Dh)MB+1B(XM&b*fY_Q2 zAAYU6%4S|;D1p7lihKSKH&NsP_K$A&y}?9ydjSGMU<;1OpjO|r07+q*I~(oKoZqg? 
znh$*F(o`7;1c*{`c3``%kQ<`O>TH5(`vidUCQ_mS8Lcdq#yvJ5aZ}=^(Jfg3 zOk60&H$r@edT#>Eh==N zom#VTIa3P;tpWM)kF3Io4le;mQrnd{JXqM0PNW9G^L+1hgVWCHcCCic!C2O(j5LKP z8zpv~a!a!tgOoh52mInuqAwam9~*@LRxHmeci?c}^Ps-x z5=g5UQ$)1H>&0bXpjsUQIEL{<`mg#Ppv$BikcMHyvw4Wuw+5X1gPu{cr4q(H@;_*G zCnTpF*^Tu{^SN`UXlERwj5 zcUOggr(;&V-8yEw2*7Ctgy1ISzM>h5EEAJ;x$>}BRQ||Aq~Y@>^j?rMYtt_g$%gSX zpvC@=ZXjEY6X)x!Z&QXlL+$c)WjXF)F^-cPaiG%mDJ#o`y7Fs4fYbmmstF^w%cWE{ zb4D(qpw2Gi{bEBk0CG87Y2Lt(@SlV<+#EFSCmCHWm89~scQ5=ultTJko{dT-h2MR=Twa4gquAn3cJ^62GJV?2*8AK%@xFa{TiK6jxD_Nu z-q1<1e6}JW9*}phxlj#U+#Zf?K zKRRp|qdqoPSGgp|Wq%GjXH~{ordw=fKkEuqUljf9_@w)(>p13)u-Qx&qUEcnJ6WQu z*$mKh^YB?w=QwMv^=#yEqk3o~P)JeSp$5c0Q@EOX4?x3i9Z8%c_2L^=zY7oyJ}QoS zJ43*o-YJ;&7)aJIzh0MH=CjyaY~&_qd-vV9QxLyhV{hOAZ_Y2fgL}RF!ZT$}PY7Sd zvO211?voaj)wUZ*c@y7U?9ad0f|@b$rM2P*8+rQb&I?|xM+IL?f#OEMHzT6Qp5CCk z*Wjh5PM)_dvp`ZfLsF+<3D1f(Z-HyR)yGG2Y<;T%&<~coG?v>!G%tagljJqMo6i_( zFM)l|ekyL(U6oyYZ8Q$?P&mXS6||O1els@u_AbO-dUj*_ou%=W0ON+|-PKlQKe+2N zZucWh{&&-h2ixZcPYxS)St_4qmo?qr-Dq#Lz5R(+2iyCkn>jw*+%!Y%VeqQQydB8S zJ|U*!R6g?}>h``^jQhcwhGY){Hi=c6-=oKg-g05>Z0oK;-eXyP*f2#%*tXE{ylc`S zpEET2jf8Du6om21sSWILV5i{DfIaA8b0%4kZTo*(96cs4yq}A^H22P*8;BWtJ$)n@ zM#8C4qGP(@&H8CnTx6C!? zvYMoxLN(V-Q)wHnAFG)sidvdk^|_GdY2icVOe+tU0T@>s4>j|L&*RxJdr#K4_^91; zFHn)Czjy3k%FPEp!8f{t)aJ$QG<&gZsF;-43z#- zXl8$A=vdI5eiM+{&>9HxhGY9Rn-zYqd1^JtVD0V`)%9VY*!Q9L#y3`@Gea6n$B!SE zUA>*Ju}RtvIMyF;p}J5M!2jl5n{ikymCOj!g}&ILesDCqCBgM4!+h8&!*@RmzzMHd zSJwUemJ0TJwy&g#c>1%mj^h|_UQc^oeMxB1@3r?j?k_j)8En0E*GKH{Y7(0}!*cB@ zaA-5%pxo+bZCmexkn;g?)%v1%wSf%a8g929Yiif$1>|N_UY-CW0#Ux&R5$YpJK zmBP?NZwCi@)7MWOwS9n+nTg4RB>iK2lY8$3SrXfx2Jc*HTaQpd(YdkxWWAbd@bXaQuWYZ*JO7>+~Wi1jqX!+ zx!Sw;lk;uS8}9E&;o$D?r~6!-;ovyTA4);@Q&2DUJFa>pEoE(T#BHYL;sp7FG%Oh` zNYqrtb4YTge-z|QQv{WCsx)e0dn?_PqEM8BzmmC-*939KYWd%!9Ui4kTmWEf^M&h$ z6qQIHjPe7;B z6K-~33WfAL0eb*%Gud&t!Z7B}EXhXW!92>oc`$4r0S#7ebW66DEgiMW0o1U(?9sKU zo?5**UyoK*2=2&-G|rY(PwUugKPt0j0@qe8%Q!oJhkuMSwUw6!(V>?qg?VmVaL>f3 z1^iMJf;lPkO@k+Z-bwSS0L2-1`DwXq{Rl{Q1g}*?#YXfw59`Qwfm8f1F1sbQ8v0ya zLeTuI)jA;UZlAUqcsmga)Jp_S$nzy|+32J#by&Lp@B}Iz+;q6cn(tsY+}V1b`_-Ce zByLU^ezw>8AfVEMef|K*8>wCbW|9qd=9Ld(2C94xIl8M_zW*A`R}NE16Kqa5SKN2`!dw$HpE{SGI+IQK+DMFOGd%!9 z#zS(3(56b6$n11`tibTtp>k?}h7(P$2nig$P@SEnU{?`UV8yvTB6Q#M%D6kk-zNI! zV-2C2PoxmH=#8~7ArkJn6(OKn>jl@M(?mN%QL)OZD=CVWWAl<&A$j<;!$x(c3pC}VT9(y z8R&$MgAQpK(*)uor8m$>8L5g>H}qX*bJE{_d1^HhlRGZB8brAohIauBYUb9Z4{km~ z1Ii%u3@7uIKcchLrKc}L_^>K=cJ;2zdHUYC{aP8kon%#ld?I$yicS-B$UY;rPK8Ql z*C&BncsNNnUfZpFZ%_3+(0&7Cma`fJ*TY5o9aK|Rc+-Tr&txS+i4v{}$*9)=zv{WV z#PMRumdL*D$Pd~yFBJ>dR-U4+)12ks<&uIu=iI_uD&4MMCVo&xvzD1cyU(p~nKB^l zuS;N0Nd@^!dnOe4n5!AihA}Lq+~=Muv>hp(+D)UHHi4c#ag!{#AFh=VSq){W9 zNJ}|=$H1<%4OL-JwMh~EZvl~Xey@FI%DQ6!2(#2PT&*8#z-HAB_eq=p_W+SpN@Cdf zXj~@;fTay}r0&9QfLhWVaYnDxG0Bf5wM)F5ww%O^;bDUPTGSeBm3#+vlu=%Ij#^vN`vFo|R@Z;{#ry|<>%A{tE zea-zj%C?jUK+uObr0i{aBi+>j(k5jR6;;-F-6vE&I}_%#U! 
z@;d6$f&x$&&Q-q%Uco^lkyl^?nX!+fE>Jk{8C{-R`C+(SscUd- zg-hXtTm)H>zxL{;tVcNK=tEaD%Kxuj`xWZ8;KIVbW7XsH!-ZjaV_;>3KeHsVw} z-cKv!m@3G~dHYv2?a65*?fAk!l4)mTrH`yyC+GS0)P96_w>Z;Hhrn;(v7fD4?{#KHv-*@TRN(wa=GNzC8IVBFJ{5MsWX-orlHw2+Ny%>fQ`n_deU zH||E;y-PJ!kh@K5jWt0Pb!!tC$SQvf6T16>yhemjPm!*T-{&e^Z^f)Bt*YXrl5s{N z!`QLtx~xx;+izrk^wAAU*K)O)ME$c8he`*7T3nXXM@8d<*Pb%yd6a_Hw$00w-7j}Q zgcS88(cQ+!2N%SSTme<8S&zJ;E<59^o6^t~m#wgEg`N8&m&+*jU!b!#eS75$os`r5 zwat&PILBH}O1RGyZyd@)W4qQj%bqd{u3^rOg-EItWQR!sn$e2`U233wZxA0gY&!I< z(qa8)6F|$=#i#q+C5zhv89Bp0n=_Y5clVf7W>cgc&pWn({gi-5<$4iM9u>tP+d@@^ z$5kH)tKUjxWRM>h;<%HMk@-ZGB@%gZuWA%);XUsJZ#KCeH&J8tnreaV z@f<|7zUs+{Ai76jHXIBv_99rJSiRKnd|%2J8s7%|T8}GMVEzSEH{!XTTPK+CQS3yr z^NC#UmNv5Bb2Kc0TVW~_7)MYZ=s>zNG<}x!irj|&?e*%gz{P@=+0+_fTa)o&KnZI| zN8oKCk4(W?J36aL?ib0e=iAeeU>xl8tFh+;>w}Nzu}XG7KpeyKrXim=)mYYA`V#!0 zar1W}T&O$9Gd5@mO|{DoHvKHdk^ZB+Ry|Au+f~&)c-Eq`9LH`;3?B#O{MSHGXTH{h zi7Ra!@f{AjolFkCqt%BZos+~aDGuUMDe65BNSFYzf`hpA1Cx2M!#c%4Vjdb?us^x^EVfEgZf=SX^Kob9=iH zbyE2w8DwyHyZLOa8uWSVHd63nAMC@|Fx{%K!#y)`I0}wSeal09MwsSgRn}Z{LfW4> zzeM+bVM{S^VB#z*Ob~i~w)-M@fqAXHsb*Pbznk%JQAL&+7wM!QcT3*BQ~8#q#P;XM z7olaG)3828$aByQ{(kRp2%?k4c9wJG<_UZv#5`}a3z!Q7qBMKrAhZ>||O ztr*T&Wjk!*w4^9glf|)*ry?nBh-o?kr@y6hGUZ%q?+%-yXsO0igT}R1q-j%d2AE1V z=fSPSrbU*KE-oRllFp4;u+1V^Zb?V%P*(Ot9Fx^N{muv{B5#av?AT0|>sLtY@#(d`iKh0OWwa6tZfg90t z&X^>b7*ntqHf|kiNfYGVnmvnA-FWjJ-$qKZvT@Io&q{ZXpKEntC_F+!)2Ah(f%NXz zr#tSRuDQeQ%z^uzftWN~JvABcw>KB9_xet(dRkxY>+;GFTJO%v7BL~e9&PyqrGPXK zZxat!_A&T>32%W=0JgS5ZwCr`mlatZ8H>M(;4CvbWU@YaJ5eeHDJh+Z>siTep499O z|D?HBw|sYAJIomIxqmh0Ej9Zy5rRdBC;Kz8v|i2S7Y%3p;C*fz{VpUaB1{+vQ9!pCaduhFe;o5vf*6v=c|R)GUAOu&HGRzl6@a$)D! z)ab+Dw<#fb`c~6HOWs2c(3$OKqU7~c)y57}r>2cq-G%GjGLTc5_6rC@-3cE%Z|g_f z)0Szz`MafbGSHN9TtS7s4M)9*fj3e^;)n=c@MW!jFw(mKuZT_@&)%Xt^ZhMBe zXDdev%^q9EHec_#Y`$}te)oj@6YWy52!G}4{aS%f;-AHab#I8dtm97t&nRa(Z^RE( zzs|rU!}G(uI-YXlsDDheUV)q>E#oYDuJ}D(VD-5gxJ;cZ=+16?RmWglX zo-(}lT(@*6_sVoO<&qSBm~4~>aS6Z!CYK-FW5F|s#fgh%HioN~bm(6DW(1@n5|t9v6~GbJ-)Nu1?1jiYjb;8$h(tu4qPG0o;4k}J_e8JW4< zPc%G%!wI|z%r;)K?;uKW?snWq%l+|V&5PQrZN0E#&AFDo1Dy(;5T=r*_arcAf&OzdY4kXgPOcBSZ$)-fty@keiXsKUp^J^}SmtjV%{ zUw5qt%FST@_8lR@&>%q3QZqK|6c_dQ2Tz_5V}LV`yThC^r;UCI2y@_BmJDlffi(LZvx z9=3-`izFNyCC^5Gp{ehzzV-B~9`JGLyjFhPC z+i-ue#!ic^m-Jpu@W*ONtaHtMxi@jGw_W;x$6s&-l%IXKl*EgX=aak2kyGf$n~gfs zMeeEjX>BrF6fT5;faME^NCxNEkEqnEW+sO6upTU5;RDp(_@zVO&5w!=DS}wwy}O85 zDG&~!;2}{MUGK(|_6}P4OB~{o{ z06H*1L*I*sQ2h*Lf2Mnyg{jU9YqS(;u9k!JDgkFQA!nnefWA@>C|Y z;Uv;tQOGfeigZ6W+AO-ZY-7yqkggBMwTW-Bz(urAX|j7hO$$Z zq#8o@U3OAPijvfG$MSuC-{;le&-2go-@N#I?&scf&pr2?_j#WK2qNFCtYDw zu+UKu+%%f`up8P4$*1I~kH@;#Ci!{+G@%zYY}|ynCbC+0Ur+R7@Eh@3u|m5UZZdM3 zs@r6{Sd^jATMb6HfSs3x<)6ByxwDH9X?>DM?!y{lZYw{aiQMvV*^(3pX}?GroDdfG z-hD|`{9E5=5G==|81c{NqFC)m5JGC4)lxdI0-HBWy$3tpfS1yn0PS$zm~ZdxhN2bL z+${1iDJVr#dstbCow&Qt*r2LL%x=w)K~Qj%VO0qZRo^4Q_QS?8>Qn~Gs!auGnHwwE z%T?a1P4V14AGYqP<}+4nKgWE38p^7u*x2{4}{0 zZ-7YLB7kOv7A)T}8}md;E}C@az=yKUDBq`4#Y$~U-BvCc_kO_w1KV=<2Jrnwy`H>0 zeXpDP_!$PyZR&6Xq3OW$K75)9Vq;rrTjR^scS6IJOB^_O91`{pa1RH?LkFZufrAs` z`nz^8sK4(jw~YWrsaG?-39-t0U5EQ~w6X(DAUu&&Nmc{J zFdf*)InZ8dMCsNX&#NuJdLS*%YP|=Xnq*OWugz#FHut7_lOaBPZ?1+%$kD?dVH)@Z z9_5|Y`^Cx_2u)SPZVsmCAvUJ(CNM<)4w+lVcdd74-wV_PiGKvC*p+S&lFB-^Qho!u zOXc9Qaom3;0pIj1)R@$V7PDk8IIOXHeo)2BRF+B7}o*_UXv~1N5+l61N6NC zK%cb&-moZTSw^?buG_UBCW*#=-6D>j@IwQ;ZmeoPyGIf%K#W}e5~qzc z4bYeY<>@z)&HkYL8wJSoB|NB2BiGCT^KY4-LSKu6n6UTBkNc64WRx9@*$&1DC(uDW zVXA_X(oq=*u*G>sGhCEuSHcpm;##m``=^17;wI_4Z;3HI;Uq2ii@NV=EeW5XB< zv&cBJez0LF4xf$Z0neW2&?Yp3N2*dT*AJ8Q?dH!I`tVDPKzc&9ly?q~*V0*MQF~P?a=2wJ3oA!LKvQ=k|brL&K*dX 
zO6EG-cDHx!#7r9B;RN#bI!K>&q@h9@r&+lJT=M%T%&BGmf@#!8gpvz$2zX8a%?r>8?O<858vS zFPcs%0Q2>zMV|;58ziF+S7glIT`a1>oZ6pu&m_qR1@59kmOXbk0c8O2LmW)!Nzvo9 z!ph^@3g0IDo;G$1J9N|LAw`3R?E=p`3odPriO%+;LkRg$p?xW5f1&M>T>dGq+UiMx zSe1?JCa7w(%9^Z`Me|X*3t+Z-5X)E=m>iA2Jo%V+!7#%H$=s8iifC{)-W7fH5<^(; z*aZxlWt#scN}H7TRte~KomS+1ahYa7i+lBFX_x;n@d<=B9Cq5G+2M$R=!6=H0)>8i!^8Ae~m z)1ybRSu^F@p)s~^nS&`FUS>)tO$p-&NmXv##rAl;c#>jeyfwsJIDq!^R@2bJrSU6y zLXJ}xkUzGq=k)5>j`GL8u6mAw>1{NaZ3|DCFATPC$Eb;ZgjPL$q*!A1D$<|-@~Q<( zv7gn%o0s{!Xf@3h)u(S}E%kTAUqdeMt$=uz{Kw$bfR)fU){1xI3)VK`3r4}E9lG06 zIS#5jgx#^FZ02$Akw;p-f}0TYt%kQZXXyg~+eJ$Sc8f32r*8)Bknj>}8n|$a=)_js z_2#5ovF=2qafW;U$r*kdaf!B1Z+gd7m;KDE@kOUm*iuOsTE`t*?z22?UGnClx6FU~ ze)C97+1ly)M7{bgeaNKjW_$VAmW{3r|B1bw^_K|O`4lbn+geAhITC#rDT`R`cvn*6 z;526%;TxB66Hr9>ZK6;X0Mi7an>n3v-u*U1N@*F+{N zPiDsB6bp9I{rO)#2WKfhHE|dbL)98MD&5Du#RD`r_9XJ&7HsMJl#!uP=vn!a)s!AI zqB+Q|*(#V`G=#4^-F;yAB>(mQJky6~i93bz*(?F!n~Om}teFmN%D$fM-QwJDDSZnt zMMqI1a|&!0JGZ_Kx)98unfu#jB>q#rQ7`F3ll^Zo*V0+uZJQ(up(VZ(AiQW68(wzl zz^3o)e>Gf+!s=72dE+O3lah@O^Pz1#B@*)~3*c=lU)c}n)0O5vMi;IB=i5&wcC9;1 zC6qo6ud~5qeExA@4rjkVPaWE94AiBSb6>j;%;LckdQ3@{E;gm@k46Ma&H^Lo)gAel z8QMD|WRz(-z~Y*G#>)N4Bnk_JspjKp1NTJqAJQWIR5k(gL5SEh9Xdk)`ab~}bUbWk zUr`P|bvTNf2$+xlOWoN&?Nl}%n0I5riSU0faNniVC(WnmhBn8ZfjAuRKumZhWge4` zNQFO+S_I}JbxQK?p`&-;BJRB!ZVZH{lX{){PoqjDf%)Xm3i$jnrQtuz6$zZ2-4)g{pPC4A(n5GkL@e$hA2CTJmP@`S?5v=;#Na=vd2?}D7*R6LiG ze8SUi`Hyt~5|(MI5h5YOAs`QzYtwP7C5(IxbcG1?>pUM#Z@#$=Ey$P!kBWT*T(Q$H zt!{1(PJ&p3vp^yA`?~N1z-#STyg;Ps5AsG?PeEzvy$*90;RUCX(i-1u89o|!|CZyp z_4D%@NK;CF9tQ%L0B+QR*R#(WvYsR7YP)2{@sULfullcEUj6lBxH);iK}$Cy#o8TO z3xEjCpt2{gAkRgM`_)cdY*PYWz~@GHgI_ySzZ(RSEn8iD=n;kSVgTDOb7)3;0k1uf z6h^PR6F&p50R=Mo!CC!~^G=cDpa_-t{(l2Ng0@NUPj#d- zK$rqk?1K@euY)9pd;}$aK0ub~> zzBdS__|6muw3Uo4%~VRR1(D_M`y~6LO@i#Egqh7OefxGJ4C=%<*jz#X7yTI5OAd1;dKXAzbZ4#<0&Q0 zz5!6u{|g99mlMx5=A(mAdmPnsNw+7o_ih3vas1Zr-5q>+-vc=xCze{q-yh6~q*fmS3l@#rZ1#zLI;+C%YTudw zw?@@^0zNXo3H(Y9)~EJcI`6lavaa4lA)A1^b~U3wDKUb0r^H`Eo$F8oNWQ$kbv*Y; z((&fv&t`)_=7p%ueqhgLG{ugOvXFWdtj8wEc->cVJR;o0N#=mwUPufa(n|t4b?Jm( zAj6kT%>N0Q$vao!LZSzn;CA2V{v8Wn*NvXN{%%osMMxdrk@XYjD{`)aE|t~(22YdZ zE&BJ;hBaQTk+kYMr5Yk|)l00l*Hy%#T|ZK81FD$yx~?Z3yH}m#isZ${o9PVIkHIWn zp;PQoRvfZ<4|5&forU6n)S|QL~`wj;W z!Dpbit;Oz@>MGoZc4CZvIWfaiW+j%fNWSGyDZ4K}6Y^*~LDPwaR$$UBO+UG_Yg?S$ z!6bu})>o&CszLr$Na@Ivn$Q^45x~Sdmu&a8tb;eoX?^%f?}dyA4vqsmSUAd2W3jrJ;Q>N{=Q>acn_9i3J9kK6-8L=%~LZzS3a_P}%RN zJ)(Ne-|K*+__Ml(gQc)p9>dByJChTggo8;gnF z@UFsPdGfMsxpJSG+AH*H>-I*BgS6^w9VCWYNC02E&1`2KmC)gonR%4|s79Z5J2Tt{ z=nU?iw1Z7loNETLs9V*(Mgpni0)@9ef{7ZMkJC7UTI6!M%TRT!L2N4b-w6OGv^T;8 zk4Cq7?piLSZm{5_uAPDzTLQ2hnYMa*-eQs<9s(-o@Bn`Q#tpR>QDO?R*is{ji|;q@cF51jrBa(2kf&1X8X8Hk zL!siSb{$)SLHNr`NPpH-aWbm9qZoeQx?B_r$FG3~CeBu?p3zc-Fxg~|=p?KF_qR;UawM;k^o*@6BP|nj(Q0Ih#T0R?KLL}3{J_s*72-vK!ux{Tf zXVWv^&KC$klwnmheDahlHav!y4-W>E5;B~=f@k8P-gA!Tv0-LJeq_E|JH}R&1*U>{ zsFD$vVErQ3y26nb9UH-${)~5!D zr{B}!^W-I%-cFX1pFPSZy7XzKn8f2~_SqZPg`F#hrdNQCX5K-BjcHv&KGQm(z^sCj zYHgA0l0z8tFt^^xbsf&Ts1TyO;8{>f$438?aHaQAA^=*G&)6GCuvh4W8@h%Zue@5( z0o2LKdpt*96KO1C4_TICcL#8x57vr~?P!l@(!8%|3Fb*fex`*nu@D61Lj7j|4@B&m zTv1A+tJkRF?bdma&ypyP2DbJBob7Ch8F6GPUPENSNk#6!haAw#e$vhJ<2aq~oLu=T=r-qxr*E(u7zM!a znhjNLAQ8gmS#w2uudv9x;x$08Feq7~Hn(Mgvz= z+{&C50`Yrgqgm$yTy{xgaJoGuYt-S^=gECu+2~E}24tr)gMlMX$vbMMN=3y@8VS#{CC5g3aI&wi zF!2(~T_C87(Nx)pZV`BEwoONLi&r|b=57w+Voaae{l9Q8^0TCm@ru9Im;yQn&6SE7 z$~r#y0C4b*j`p_unnM{-R`P(6IID8waXj;JJ_p}+0d;Mws7^rqK9&WT5mLuAE1f)V zQWI>A=BXS8axq22h2G!jS`@67Q-5CDdI2S{N~=M(No#3 zczN2U&Bgj!W*CxtvcbCeY-xRY4SQTob0=!1SZt=;W&|g*3fiKxyyh@dZF@-F_E9o6CkBN zV~(%xdZHo~{qZW3+BiXhTn#FhRr9v^2Z)~IQE}ptp4doV_tdxmU?ie?$=-?V450E2 
zGTn1!zdpkOcxU9-&Zo0}%*`aOT#~FVkMj}2ldF&~jevQJ#DBNdIlnW`fa)|36N`#V zo>c-J*VLcNf}%Kag?mv;l=~N%VK`VBUD*1b*BHLY)FW{LAZ@~suL!nyyom7~A|ie2 z-2D@;^Hdi>)rbV74w`{KX52PHp1BrXXRI+IxxQgy*_9Wx{+xO)VhVV*z1fPX3CTll zk4N3Eog#bqOk1s{!=w@|74lj_fzAc;EV-kwF6{yb9uhifrFIJ2QpB z9+4p*iVEn`QknbcUvocoH6RLmr1f{3nrq|Z*!z0Ue_uYs0vlr*gBT}gS-K-|W(;ma zaW0R(;x#O{k_2rW*0k#h*SMht^RzA)eGYK}Is=77!l;8$P@6yrGN$0Ci-nNbOogZE z@EA-<4k}UVV;42}g~z$Ix?;GKz1{K4sVO*H^%bbqTGIS4v_8@I8t&hrq4HVdIdGGaZ)E~x_l1AoN1}%S|+;w zQlc7Opf;lOc=H>Hr&k=mzLnUdU5<24n@o^%iWh3n^phEL%N@XDD{Uh5202PqHEsub zDAybRG%>g+VD`Y;Ys^K|YTG?KbFoc8)S!OH7_QZ8C=o}sJ2Z-HS9?}P@R2d7UK$$l z*HTzLvoa?<2}uY#I&LKKD(hKay_;$3lT@o|R;B#j3<#=yP8M7j^$b$1O zL9Q=(<}fTOU<7v}LOMc`WR8R5!aP&?o0C1x5hTZC-`#N5Q#&H1kS+jBSPSHaOIKab z2~icc+{eZ`BdI4H849c@K+%pFhA0sK@y_)$$92G!i293Oq2op|Cz)<9elbr=Ths$n zKG6$Y;IRsNTn0U39|pLIg}u5-QZIx{f2lPUpIwzoT7%iTS0qyLV$PJske-q`Mk?62?0eslO&2;vaPSS>>II4e>1jpvoqd&FSiAZyG6++ zIQ%+il0m?{lqA^Ik>LN|xWZMEuYnshc=*qs{u}dg69U?_ns@cxAJCky{NR5|R;w%r zKIgx%oPRqo^+MCNmy*MOrgfYL!TBCnF2635L3_b`+an;Tc6p9of7sf8g>6tTA!h$e z;cFnpkstB_i`@ftl#kJpy7P2NqFV}$awkCc)-G@no$mAv^l=@^G$>)V{44xu2Fkg3m;{D<0Wx8c zY-;kWDR5^@se9YN?Wt(&u& z_ifz!axGw~4rgdQMq={q+x9H;(~UmBSpu`v)8j;|*ZA z4ZB-fC-)<$JrwQWr}7EXDjW$su*0wX$Ke0}!T%TO|EALXrY2{}omi9b4g{U($+XoC Ku9T_%74cuDm;Rps literal 0 HcmV?d00001 diff --git a/docs/source/assets/kernel/key.png b/docs/source/assets/kernel/key.png new file mode 100644 index 0000000000000000000000000000000000000000..2059b608caeaa7991113bd0ca05654e1a53d979d GIT binary patch literal 111314 zcmeFZbyQT}+Xo5=QqlrSgCIx^N;lFWA)P}BNO!}~AT3BKrGO05J#>eX(hOY^LwCbH z_|4xN_wT#beP^w44(FU*&))lq&*#~Ot0>9fV3A=VAtB+&$x5msA))ReA>EtAxDVV> zPemU`Lc%Jsl#o!7laQcMak4kJv@t_Mk_}HxMpswUA@bk#1~uZ-&<5KGMWR)jMHko;Sd7I~!z?u(_|8P{A+{Uoo7XB6oq>7UH|P|B^|$A@ zw^xS){XX_D+|74{Pcu02keGu&iMj1PNN;4F-Yg=E6i(K7kU)@H{4tnXa3>GZ;Hpa6 z+8APHs<#Gn?)Tolny(~_Jl?&%)p|`3QI3WrX3WHhRe|=HiCU(cB|tst9#T186*+rf zT+F^y!s8c}VhPkSjt8iC(T+=erX0&6af|v1W5_frXFfVFkRCo69q1vZ&vs;LVw#o@ zHo@#!7u@Un{{3NA)GC^JUhKnPWoKI*yG!)9Z01>XPct(M7s9Keuv@j#*c8`5H=?j* zKLzPJpBG`60!+`!TO8CmhlJ}>lO7iu$t@NBlq13JzV7CiqI$&rBhbd~iXQvCsWqZ@ zqeA>CzueZKTE~w?DW!6zX^=_0*~i5QeYqFx7V+oh-x>}onL>o6I@CG_6~d!FuIM`+ z70L}LMPg(eEHZDs@T|###L>4{Fqbr&Y(Iw27ismW_4UhzKTt1B7u9H%ZTlGUk+L99 zk?G(#^&njJOu^abVQtQaF_zzJFaNfTbn-7LsX;&6x2_Uo5Ad?v(;e5t^ z5b(HEeADUvP7C>201bY`I0gFZJ!Dzr$71(M0>2s&Ga*SCaZcj)h@nkl;|8`bFug@# zY}Ii<)<>RbwcSHH@`vo9JP0^7dKgF}BpI^){(cVnV~M2q!X2SPw4zL?SYb<_QJEgc zi#ff0R(YQ}FyhOLJi`1?NPv=<#TNw?QWrc#$o((8KN%kdJma`G9-wGMo`U1NV8ln5 zfzH^j<3N>$XCJJv$b5!BgOM3h(gItgIztnRLJE%JAn@(LGJc{@|1ex89{+JU#WP9y z_sluCmM;a~r^@0T5T3tlHC}&2(uK7ggV`ZzjGDsQ=hWx*>hUT~F2P7lZTLp#1LG!> zZ`!tn4C6Sx?Oi4x%CO7OCJDD7Hup}1PI#)RFR8-XpF8IAk?Pa1<0ePw1?t64M;CNg zE!8>r)))&>l?1>#ycZ<)eSC16keZ47F#PWO-J6fS6{8_K!Z<bn(Iv(sWY^^IEul}2k9`k{_x?fsEdL=kR`QEvI;c$!#pg=Zo!4`!-_(indATz_NzBR? 
z@<7h&xxWiuVhDTH`jf zdR{qGg{m8(D^s;&-9AM!wK%0vd1_@iMm0V(hWRrNu{FvMJycEpwpo;#U$1ytx$~LI zU{FoD5T>{rHRqf}7TGRukyt*ZBXKcx5pdyjEq|?a{ra9-5L3|D{p0(w59C5^+UeVE zLv2GPq`jm$qm#(S$Yyz;l2t^jMY%t6e-xXN@~I=GklXUbV-5}8UmsULj(-y2%Kqf} z(d?tf$JrO%A1nGt`rK0uVg77stg5U%Y{fclNdrkHy|YOMBV{87P=#DA#b#5fvL)?b zx?{S>x|~H4g<5*NI?d(J%fn%ClM&;?vGE@~vu>H&_XIWSY3j$^*xc;5*tXD4ex1N5 z@^Ei(StveHf(3O2A}CJz=3KJ``(1Q5M^}$J;hj3G`fEb#CM)p^<-NafQ(JLOODqgs zI_oZKiXDVEE{6KyY~9;=(}p{mJC-}#JNGzWS-dmfN*!7=UT8hdJ6hj=cAf2LZudIMbU^MG#ZtpT*>T)C*IAUqY)x-XQPVqnfBP@?(9C?p1jCQ> zrhfNs-`winenEozGawfQOtvhvum!k#2nZpUeByn$LRiU~C!9h^L7-1^Mc+XuNizI` zDC04g^JZUIMU;h{Id@2)3V40|r;Kr~v5s-`KvzF*s)5L*WsSkv@LmjtS4ZHxPw!6S zMB-BEs3kv2{m7omvK(^GF36TK@iC1s^)j(FN$v!Pk3O*VMoa@&<4(M_14*T?26firb3) z1?G_Ipt3^COd_msAj_;oCVL_~_em&aDN!~4K(&uJIouF>KUXs+CMP79QuZ>5>H|3Q z<#*Zde6+6=_l=YFvKtr3(5+v2ztp&$fvrKZO=3uI_t>8x1BN%E)1sX$G6Gg zY%{*5v8NezSm_a25z3w?*S%NsMbQU6is(#PXvjS_k`u{W5BZ%w zrWi)Pd)5>xC6_%?*Ho6QJ(3_FQ`ehY!MTFiaQ2I$pI!vp6rV5M^fUwgzDrWy8qD5~ ziV+;!Ul?js-hcQuwW|;CU?a@5<;)cok(hyN3?x)!G9)zM3K{r{Ad~-nEse~Cbno|l z6eOfDOC;1k-%$d7?>_H=?_HTce(%MHBB29+5dvS2&nSPrjk@>w-e1@ECV^*2FV!UE z?)Mu4!1dj25FO3$S6r-x>9iG9Xe8{N z%xHMnIoLVqM6hURXoQ^JmrrGty5y&cV6y~ZZ?t}eoKbaxH?{rRJx zW*(ORv}EV}=dgeQg6^Jxp0jg+{;nG+Ds*>OK*iF-%tlAj(iV^z(1r*YQ^U6-G@pj@lWx~TH8ag@259(w$xT+g8jWcz{T=TDZ&1xh&`RSRzyh9|ssbNwb zh_fSPZ0B}O`0_?*^*lJL`{zx`ph(fIdx^!a1tepBZtxIXGO-JTa{Ekp7g_99iF)IF zAl#+EMZxz+LZv}M#t=jLkIOvz067egL{`$M!2j{NznDA{a%{wZFZjE<04yT0mI8-G zBeMUgZr~OCy}Wz>Tgx2Cs0~=8xVbO?vqyjT6FFAk|C$Bl`hPi}xl6DNT$MS?Q6EA9ab=Q9FO1NmU_N%ViT&ri;P=G*JtopC*T*VB2w zW^8n8Q%O7dDla!(?l`lQG&s#KGl*ljh#TEr zpSpKbc^7(L?RQrg!R8_L-oIE3KT`hDV$WTR`T~>p|7h`%Dau-!b4;p;Z(aTQYO3?Y zBwed#ry(GpR^JT;L#O%k0rwflZpt-!eALsriN=BJBdDlrK}nMrU#D9X=8VU3^vsj)-z5B4f7riyjH^(J* zi5i+8jUojUZUDK|2%fKen1x(!ui*-Al@6dRd*`9jJM01j+lg0T|1msvt!V~Z%kUn* zfE0k;30qCW{7H!xAEDsqy#N#?b24uApNg_8rDZkMeuuqgAy5?sn}pTj1EVr~17!5I zOs(+AWcD5zV|Q4Kjz#*f>S??BYR% z@3q|9Ip_rq8jQ5MP8h6>8~R;`9uj#j6&POU3jz<-h#*JV)U|^?tDHoZN-~zBr!&rW z;S5Q#?e7{xQI_)+7DCDG8~{NS)ot0j}j8$RQA-fO5TdOGKAzi}0s|0MsAtgHB< zy7fMJrEjy!%Hi6e@-FX1Q3R}`t!iAiz-(3#=kPYe3-N7K30~7hW}S2D4G-`q#V89w zLbZ}8_NEl_?L=ce*l7=_dR<`>7liLpdlQ?mRjUB~IokOL2_}NeQVqq3Ar=jRKJ{ z;Q6wIRuObYPkr$-jO;4RVgJNA^HN`eLSQijQR!oHE1>Cnc0TAghrd(SO{D;6#>mi0 zJpPsUqa~y+WQcgg&@SY(o|PR`$q4ywBS9hMmZxOk+|R8unZM;hz&GUEm1J6K=y?eU zA~i3qu;6l6+Ou}ExW3#;t&z>?1iaK)b{_<;QEb|ew&$7=?Nt7?9ai=ff)yOZj*FG#A_`#!!Xu%Oqks`*#F}c-@O38d)lQr1r;jhAZ?~ zy3G*TH*v70jKjuiC(Eg$_9k$CNz(;_)nMK_*DM8JQ?GHo+a#23Gt}2y5>Jq@GSIgv z*?c7B3Wem`d^FEgUaKMR)6yq>aMYbF;`#CI zH};i(Za(j~$7*L{uPyQU#S$_hdI=vicRce%$G%R0zcB5JVznW5Q%@E>|Djyn7bJqE zykJGy{3>3%#=*eq7v8aP*twuE2MV%`1NbD zPdIg1rUafa*CEj^h0k6q|1HFIU{b*_mO)^=zp{usH!Ocf?gafe*{V zv-xgnKcDtYGb1TJ-?LI--FuAJRFPh;`668dC3R`bEXD?nKJHVN(9k;XLE+Ptz)cAy zSxy^<@_gR1ZSF_KmXZF+65o*lL*ob-8e$MlU(lb1Mliq}B?ElJ!1s_Jr?VVnMfh-3 zqDw%O8m#noNFif8I>*?E7B~DMg#9_WP%U6^%Qp|Sxaf!OzgKZ-K}M%ngDNS8rC8>{ zKc|J7-=D4;Q?Oxmlb= zkzxDtNae0$gNrF;iH&9h5l!1ClTn@V*q{au(#BA`O_*u#31Bq(&WAX(y$vqO>hk6T zgi8ceMlV(4HcknzC7p;B8x?(_u?SveaEWxDF3tZy^UOIsK~6j+vV)^Ma+U#XBv%ueO`slUkm- zeo{-Hy>dv7=^re{@UN!4s2DkPeYiJwa{{x_SI6r~^9n)3CA282IEE@UlmGfciaZ!$ zU6&kOXAaBf3`ng$X(D)p0Dt{biNMUq106um+8`MAoz09H9(1I;GquhVF16cuerN4z zqNoiNRJcG?6>%53dLSE9{Smqv&0uz*Oam%@Thc_|BYZ@YPHHPgBtrHcRaB5j{)gC>fMfY(tHGauk*Ovp zZ_tcS-UJ_vOlEljWjNz{IXrG-*eF$~ru=Lgd!uGKMgcT!?1Jz#@h*n%5mhw<4zJmH za^6TE{0dGwWf$@eQf6=NTw(2oHoNP)>)qc#0^UTg4=3DXuiq*!oJk!!JkuTWc@s#4 z7x`eZ<2(C;3Tn9>3Yc{VwXBKB-mMj@pV-3SZk-1a@AZVbO;Z)WY>9-2cnuHml#b;8 zI026`7ZUGPzE#*OIe(JM&muN;X;8tTf4BKn&`>_S32ufj{$p?$Fwf)`XHS9PVwC4L 
ze)UT7;A1-EtL$2s?$_=cvypTD*crM=)_AH+_c_mj)4txTi!sjG@?Q2-%H-{a)5cxb z8#HeB^GVZKSBTljQJPbDI~&A9cZFz- zawYibxI;wPV8ik(iCs%;T~3fW5m-B}V9nPg-Jav1+4Hr`-K#Wx%TIuLJ!~&=-tca1 zXDjO<{1`3Y{5`z6=cUs5fdp@4gW1}Zia1mV9V*ngp=gg4_Em4p;&S>x0+OCN&wja0 zjCRxrow>c)y{+S((63l8x8{uv31D1b(X}r6b!CEKiKv750FfK0vI9GHr`Pbt$xE>r zS#;27wFFpgW(zy@yKV(V)w-WZXVtD%Mp+Gw*Kq7Tf(|1UbHczazoW9c=KU^m!FeL~ z*;D7AUz@Y>b8RjG?`iTpmTTje1TKGv=RW6Q{sq+g>}jga($x{_0JhJD>oxdSga2AJ z5E!`l%{6!`S@w9oKuO|cI0`OboFv=v4I3B%z z4PmJGtX6Dpd592|$3j2{cJvP*EK4jP|D^n?pdgg5ats9-;b=kXHstB3S<*~LdfKCk z$Ff=ZE)wFtl>G(KpGNw5Kwv*_V9N-b&{qh@O+vA|cP0qTSNn<8VI2QT=PG zy6pgHsqXi#@yD|zk*jE9GgACaxbf@;;1~5@Gk*3Wek%exEjF_?E8mf9q^ptGpRR=WNM%l(sJ@^P4}^UmYO!2&Y^3kcp#$AK#bvy zR#s)z$Z4Je;LbF2;W<*&J%xJS+C17WD0DF>=;~?m$je$v_+UhAsOB+G#L-wB!nRc7 zT3E9jBJfb`9QTGH2jLE#hlXx-McLIXhH^z``8By4k;ERn$Rkc;u+mL6t4Bnm($4ZO zSqmi5Fle35NB1zW6H!t_&K50iD(UPSu*wkf+IbSx>Py~%ZJSM>y<;TJ#{yKSD>W!_ z2iUQ4tg)cqu;{6}-9q@(d0u{wfoiKae{{k5tH zTdDrSYgJL)s9s#Q5uXIT&!MtiOq(6Z6cyd(e`4I4VP9qM?K+wD-Uz=rO@0N@v0XW$ z#IIL~50?|#z94?m$Or4GD4zDobn1Eb)p)ex6+0vj&oVf}j!wF?!l+;t zAvHhXInGx9B0$0N{7d4ua3HMIqn)RThxlgk6ON4lqq+_ZS|lIZhuLk^Wwx`r`<`sUahCM1P@r>WxFc!Hvt+GLHH?xkOZMu~n5vXC%l zQdxJ7yf9YW$a@eOngDj6bq&{CiLx#0E>|#n>=k$JIZp<0C!C1kTY5K2vL6*<(+QbyC9F2EHHNqF#ml6RJI@hg#CSm^K-4Bt zX)97yQjUl6k&=w!DqL#4@v*iwhVNAw-5H+V7;VT%spjUWH`k61SzBH)$?-y@AFnP! zLtYGXM%H`~q(o|jMZ117><`n)YO=Vy!uQt#lg{+F_uyBoDMrKLay)nHWn0tKF-tre zY|g3cB+)BmK&^ipN&@8I==XAzg#kY5;7$Zfu`HwcMDTgLlWs!GDXueMQlW=*sd}zs z8tG574I$^h(Cx%UCr_<%NZeK61-PtDXGDkGV4gke=toH*-jpQhD=exl)j$joRNYf9OFZl2#0B#dx zGx^!$88COA7_DXx#&^@RtMW5ui z?dB#6o%8diBhn!5iy1^awcsDI2 z0f?^jTQgw#T5Rh!auu%~2vG3XGyFq+?Ew`x0D8A{v*vdLy?lp(`3F>*#}90`Poy-f zfA`D}g>VmibC4|9?>E+$4nQ(HqBqN;pF9@<&gbV7LIdXRxxb_#H{~55!c6S^7%9A(~Q#wXps~?8QlWvzu6-J8h`%@WY&he z&HpzS^v|WyB!KOPDyAL%?FRf21^%rbkbwc<8Z8q4Q{+D?YJckQ&ug=k^Di*^-z-xA zE(XnlD-w3e|I{{6(IXz9dSfC+;lF17zjVVz3>f-7Z4Usa`Y%s@cd(@tKm%}6`8NI+ z((Cgbh)UVg#_~UwMnVk*Bx^s}A&2+BjE)jttYs641WfzyG5rI(bJzgYw?9?#`d>!3 z?vI25MlapN`lrahD+(|MVkUAP$KgLC@4saJ*E?e9z}yJX4}XSS z|1mFjJ^H_?|G%mKZ;RshHv0ehu2-UI1r&y<#6pJmj@*@E?wWP_xev1A179_5u?6<-IQ=4`3qNCtHBISyuGF+-*AuyGOB=?tdZX zoAU0$XrypT1O+@MkLpdsL4(3O=aD__xt2aM*-I6APyW-}f9XanjGrPb{%y~G>nkRD zz~_D!@&W&mFbaSaQO`(|+a{Rzxua1vK8Szvw@OptM5!FK(lkoJ29%@`3ZTywX?)^C zTR1R(icvhZ9lirt_CP2Q0xDkT)xM*SsK=Z)t^t6(FxtZWDMEh94CE<^83kgDmidPI z^S(?Bc|!}RWiI@%KW!Q-eam(7CqShp5G>rNNQWNYX~z2spJPr9Q5#yvF-^k6%rHjx z?`u5vc!UCU;WIkQjGwo^j0#2+AMy^~6L8BLz&hly1_;dg5$rMRrE@$$xn@x5CRg{v z#3u*rN>v~DA;5u? 
zz+(SzHcnd1WcGPC#c~rs@Iu=R+!iwh(WyK>-{BqWNno#yGgYl_JfS`RX}4+H*iS`|L09euNfRT1i{@JzTyP-!;McANv6B zT0fg#P&^gq2@prl^H;09zcH;)hhEl-D8ZeY~&tM*KbL2LB&as2496 zLP)_5w`bj=!+BoU@g~jkXwye#()*flILN3Ga;~cHcQKNAhle0|W*pj&6jT!r=L&4A z`;LiK4WDdo9hvQe{LUl2Pv91_sdGoU0v2e~fA5x-?*twI4>$uj<{PVhA4j*9p?RNA9(K9bbp)<;mgAd{*gvX?B$e2I5@Z;EbAHiwsZmrDmsJTQ~4Q$avMuFeU434kM+UBpxAtzgi?aNGUJ395-xsuw|Ph7s- zsOr@Bx0?V-@~)hI$h=zl{9)Vr>ZaHBAYZF7pO%`r-VLpxhL$MVo3Cj-EAqlhFz&a* zDt3(NG9pp{DK5oFAyR0;o9+v)48jxP+s}UbBIrssr4om9$`DcbQ<-#XI-*$*8C2cv z7MNG=Pr6l2T_Q-lclCaG%lB7d#Q@gpTs=`%feH3ofQd_~7r{jb;un>k6#|ru77Z}j`6Ya_-*SQ@A%^6Y{wOCPYUKf)a@HlQ%#TvBC7OpGL+gWjAg(O!ht=g_*JAY^5qJG!1868{VA zKt%G}nYNPsJbj{rQxWk)f_?$oEZ1gO@g|7cwl@tGBb)QUz*PpE#>`kMND+K~s6 z8j4y4WQYjx1BoT^+V6)kXvmc|-X;V}yM1pSp754pIqe`)PJonZQhX7H@_33G#Q8HW z%AE^eZdHtI0}$lc>-a_t4d!TGTChEjiDu1D+@37JGkg$DC1GFELMx`i4#3{q#PM5~nuatI{5R_| zq6qIRGXjJ$^>9(DKXh@n)lF1cp6;^UkryR{jJ{_7$sPcz3)k+oVUINL_X_ieBB6dF zSg-OOXSFMirbfoJ))d0DB~iR*l8#ynaIj3)Bi}Pcn5HaEAizd@h^zO&vFUPlSE;yW z35(pe8q}Ki7Bz6Hk<8IVbcb1_>Km2s(a_Pdo6k0oDq$7m7xMb*LTv8ap))76-BLnC zV~rcFA||Bz~yqr6FyQYp0#c-h`K_dw5-hw65Fz&RL2tvf0qy9!QC!+;96wB zxc^!rQbLF^-9@$jCHD1LneTWe@4h%i8j#eIFL@{3n%7n=P>{uq2y;ru@*vbSrR7r6>xQN|$r}&f(InGY zC45|39$O~JgAn}KZKev^j&I;dgyovnFWX1NZa|(3s=a~jrpK*R%VvHaCThS1!TRxi zGyCfKi*Xp1J;IAgLEp05^~r+VJAOdzliAIcD;m~tIa=ZCM%cVxUr&w)RT? z1Tsq&Ll?^Yq3EY~aC~T2I8_p%XW_{EpcZAr>4>y+i=sT*H|^^$OLlIo#PXgG_l1WI zKiGgxC9%C2!oH+K-c2!$dB)nERpQZ;X;=l@>os8FHHbAB+OzOAi+=nrJ9xoOzXT=2 zOXNbMBL0qA2=|h-;%hG@A+}`nJA3@#PTE0b5!!&bnehQfEkyMZCHZLTvy3gnz%d{n zC=S_2^s# zJO!3Wc47PWN<@G~7&?0W*rlQ+X<4NA^L#bwK7~a$AhGTKw8T}}ewn{TSYs$h4{Yu= z05A4^OE%i0wpSKD9xXSr%`92ICPgmUUg~^;ictJKL9XsIV&irdZSRxKP^g3^g!$U- z%tYtHON{vV{^&$S%@o!KJ8fHri_u66h3a@QVYcIo!6C>Uza ze&aN0gO*z^Em;bEOFq~;!~zI@5G!ds6c9U^fbn(0wzWr)tFCQfpZ$>MF~a(n7x7-i z@R-AcYZ1q6h0hb0yX*0^7J$J$MMt#IU`(E;at}*7m%@48c}f0&S39z<$mZ1DTx?>m zrGEHHx)kYs+Ib$QLUa{cB=S8BF9osv)g4_h;RVV2qoI)qbHawgM_;+PqTWP9t> zI|Bf>0{vc<6WQ^TGC8^w&3Cq|OVuzNHIGgDa=4KEL)+CU;=NIGSk|{kof2l;c;Kz> zT(*}y2_I}?MV$&}MxU^s4e_{Xwp3gm6!1-NxaRROa80{xK}1Pz3zn~8ac-|NO#xS* z9^=R>sGPU26i~gZ&XLVW8(u5~`9#Rd67t8LY&iu*R6+vKTFN7u(g`lcAm8r=NUj4H zn2S{c3BuGTW0Pg|io8jl?H7pn2WdlSSIDqp%^S2!}-~5Kwg>wpQuT}IfiNd z((>=XXGwJh$em=O$G2-R_C8w-TMI2fV23yAYz9o(X!I+gHh75OVVYdDJxm)uzgiQ` z^u~vS4C-pp>#N3oB9ntJy=rttH8E5Lfax`4k$41G=b@Gs1WX|wEjvzu&FT@i3qY~S zclIi2e%LlAhy=q~sH>mTE)MWiCs<%cEwU8$!^+jY-UO>7Qq|S z&f~*H{iP9Hy`)iEK#n|kLNTUgHHJRM>+4$`t;?O;i%^lxv=zP!qqLQWwQ+ujAatKk zjNi`ZC4ZDma-gUttHnPtC6L~Z0|EJ~@Z*J&x6|SNT2GleznV|G^`nYTLtCO1=vgJ(r-R&YaYIs7)3yeh0*y5S)r0tZUe6rc#EqgbCR0l^)tH@h zjYqHu>z*am0{JGFKdpGyQYl|pu9Z!jB^#87y{^yz7yTr}mIp-1nlN8u?U`o5vao@z z8do8_r;F(!wK&Z`5$!+0;te-L6Y5pE{PdypRb_dk_xF+>r9+A?ziOXfgFOC){?Kw} zGO+F8+jf%Y+tMv+*(wqmqn{(wn>=)%BGTWH3!}K&(8PeBz5gAnFH3xetAna%2~MEO znHfROK%&&@6l1I5P&Kgggke*T+gKiMcFW~FZm+$pyhEwp^o-mE3v~7d=@lp$BCNd5 zZt98^RKe9nR-%&Kj(*$LhP^$C+c)L|O{c*j^rM3831uQ_?wdWRLRFp1ua(ks3GfPh zm%uO;1$krov?pbP=KVO$X z@xz)(68SOTd!3Q`Zm01#ljAF~*NazQV?2YD5IKU?@cb1HTs`n`%Ww&ROSlW%aot2Jy z$jWXT*ID1gE4I&661wq=4Uh+mPnhk@mNbQ~Jb*+OOVv+a-BWMY7#>u2=n3XO(+5Pj z&G5LVHzongSOBgY0GkD;OYKkGK^|J_@o7K|LxvGdUR*8Jm?DnELfCou4NY|FPA*@J zM(lW4wc1&N3~|OD49uzIClth~{ASlwxS{2t3KFEvgkEtIHA-asW)uNL0suiYQ7iLw z-x>yQgCP5nfG%IW%8nKJBn9OUZuhiZ8_di!7>OyK#v23#^Cn1=fz8EnT*u<;$b0R5 z#R-NNy3h;_LdxE$QN*@+NdAjBwvKPHQ(F3tQiRH7M8(s#tP7TNzqHdv>JzU03r_vb z0f{M_ne{3;hjgPPhc--|9c3K#aSp!uF@4^894ClNEo3!dkLkLU_nLDYn9#-xb|49Y z!XB=D#7;MmpX~*UuOSgs{E+Pi`EFWJM;b;QVv5@Enn~&aDS#FqgIC@*m95;!{-O>! 
zxz-I$CaZ5WPt;*F`YeWOfY zvV>&&tbT1ZSbq{M{%)D@KDeK6kak|zUy-_)SmeZ%H1ek`MKl#wnn^RQNczO(cfbwF zj3oV%;-I8Tfe&AtkAZVl)M(+ zrbkm$If&&cX?&HLiyr8GbtWR)PMVvGZ=vsxv&Uk#NtcY85rLXZV=7qOUN*BS zZ-IHz;eK`0IXOjaGm+EE6Z<@9A(ri*DGT;rm9Mak0_aCxoZ^#W;btI7n^%$?gllol zJ8f8M^zw!K&W!$5|JCNFDQH=Rc=T-Sa=cM4&u3SwOKxn_<@ZSA`C92B4bvP`S|XHEf?W{gm0i_235kz8neC`$f9;+{a)mq>)UhS|-;?bN@5FjU3L>H{cyB7yle^>DYQZwR!U8&Eof5Epom*OG=WUtqp3~G9dSN*-1mP>$ z7P;Ia>QjR4)7R&#StV?+`~uQ#C-;c;qE-IbUs0q1cGDKiQsFah3N6)s+Zbq<#|Fv@zTs%sT&8G;1|teH1IgTtNA?Iwy3*O4j9t&sKY9S5_@&eQcU@Pna9;GA)r)%Hh72?wB$jNYL^bo*1HjHa z8=&|sC#iu%q5=2EyG&Fe^FD|D6&zq2NAQBf&8DJT@!M`}4$G-KT7Bn$`u0p_EevC2 zfLm}6Te$|G-6 z)}A|Q!h1x6b=$D{xeVqRL))scG*X5N>n6sGyAwX;%jJ~w-u?h}Iyi{u6agVu2w-nWOXD(6*E-qPnyo?=*kDbx z2q4Q%kJoZIN4z;d6T*(W=H1Lck8iWLOpY@E;(4EOD*ojVC;Yys8DqyC5Tpp|J#lYX zrW_a~R$CPo+$HxIIJI)UJI3P{L-`w$e&mZXTRAA5iN#-tg*@N>!!w}kymnFe~vdi z;FqAx&-{KV<%L;f=w zG>zeT3VH;xW=VYYcy{&P0A+Pu2?Ij&GF`cpSs zTq#c9d@%GyQ{Sy=sd-y4=YbGr9B`_Jn(USEDG6-k9M!FQgg3v+->MAMa<~w9oUHY( z*Uk*)RnVmrnVXkvnx;&yc^F=Kh{hL_3Iu)thmj|Gw`Y$B`T70D_>reiRLLI#?iee* z7@p8ANk~u&AO89&DcF~IpD0G)c-R{_jP?m4rwbh1Q1H$<{!m7GnwD8Yw=GuPv=Ft&^|PRgN#+=7F~YDItulAL z{7j9C_zghG{2)R{wGz84N@mn4c~_JT)8W=O90L#kksc7~c2%$c@IXdp69Dp>)@s)> z{Y2a~uphmB-lWJo%jKSN593Ju4^j>+pH_;WM}X`qlPTzKXZ75$*h<|Y1<0Wo%@Cs` ze^(Wzbe~3r=#Dj1RY?KzY@#Rveg{#~#boiczt_nfNhbL@2B(90zn$c-J2XG-fruuP zcHy|51fTkEgUx|jnLpR}k#mHT)4Y#EYw};2CMg(y6i}2@q*eU8n!l*IKbW^lIUqqr zQN`nbN#SUb1TsicF%UU_<^%n9mjA_q1@HibXB8%v{l8d{e_sM!!0;e?na)KEoK8j5 z^bDXHk!{kGvT85bHS9~n=Dap*fpjvPLY?=Xe{`^g1K?oE@0$(ZwZuV(N(fjx4ieO2 z;4ECneD!d1*>{eQTkl#3WGLcTEBanoUjOEG)Rg$YX770bAoLx3s2=j052^ALpJucH z!acCnYQi%$tad=1YkoD(RZP`^M)z1BLg;OCq%YU#aokX#Y(%zApf>R|g2A zKtT>?egGhq@)o~}(akvz3iJAy$`YVFkp-A%3=(FYd@f}0Pf`**iUlLTyis&SyW|*p zs$JsA=A?)zM$zro?|p9cu8S|h$=$Qzj`R9tT@1%R`#8aBrSRAP1z?fk2;TG23Od4-pcyx96ofMaoT4;z`O#`jOTyREdLx#H*ItB| z(8E7NDE8Oq*d;;A0?=>3@7N(a0;Fmkz*;ty7A0Guc793cAG^}vP ziSF>sx~QEd!5TUhKXF;oSQ0!#?TQAH5gb_@4+P^-&l(l!eTVNRSA`nI)dx(oFb0Jf zKSTARTVRxO{$AJGsie|hRA$Qm!`64lQ~Cb?b2@Z{9E2i!zU|81krI&>*<|l6dt^p5kxeCJ zXYXB*%o?cC?SuXA75^?E(mg@YgkDmZK}0%Y_sD;^7k zSD6rAy7Pix`IygwN$x2$Ztjf5b>Qj$<)J&JaKFu3BvD8)c5?ZNRX^K|HLXXqZt1Aw zPYg=I2+Asd#{bU;kU5uZigNbwTNG`$a{OORFmk7>8L%t(9k0*)hx0~WU})hc@fkR)!?b_dY;0~?T~r={|#<!_ATbG(j}v^f9c9k_ z8lS6n`*K~70&4tDQbeN9p<6V|b?Ol?QA~rnvMa|hsw!X0bG2#aPNbOHc$17x_q*$o zMI{si2+RDfJz*hA$3^2exP$t$!NRAaZNCZnKJ}*#^@^B{8?Y3a23*0Cih$*E41^M+ zGf$;|sX-f^HuDsGz0dhh|NYJ2*{=E>HY3F9Z4N<{74y-bw7wNS31VVi8_9JqL9;! 
ze>~hz$o&wp=r6?H(8Em+gkFYfZ||KrDukg&>QSkAkFPu>PXAvry- z`XXZ#exQ@D%FAB|AfgmBI1ee<{TTQ^z-t;|(ei>1d6#D{Pd>TSZb!eHyLOr5S?NAITq3 ziB{rUgN8kc;fN2|>2D z-I~ZS+#|xZ?X_gx@voI80NIQD)V;V`p%JSYkToTo%Sa)hmO{FUHMs3>ZgyzD%Ved{ zz>XmsB#S`X379|M6Kav#=QmjR#ljV2#{n7n9;NNjvat;)!Ob+)=L{?*9a4Lxw!l<^ zMdwfl9NR-H?KXIogApK05}}FJG6qEaAm?G#R~`2ssXq+Z0aBV!j=9K6BjoX~?VkC{ zLZeFNed7UkqHW}y}Cj6Ev_)4#z~C3Hr~cVc8a>VU!V@Tt!}b6B<6eZ$hh2RBOcB8DJcIRlI0cUy{0Amkn(z$P{kn8Yip=* zK88}jwM9vHS{G6dz`(1;>uNq|(fmFj$2dxZt;j1iQtivm4&(wz|89R+eMu6ig}V>o zPQT^+c2H%Rlw55j&!CO1Q5h{f+i+Wnr_bdgP}s3?H<@DJza%}$GrjsTx+-g_@ZRBc zj);JTynXYbNf-jk$s1b|cX_&{=w^8GmVbYf)sFU=*#0DpXuYE8=)l^lvi>VPEOTzX zo=OaUud^pmh<$`yI`{IfF#Zs07_!83%GKufrU|EQptr5{06 zT&I{Tf`dJF2-xGF6*QCB-`MYLjF;wfVMYAB7kv83p?w*V-gJOeYXO>-BzxSzXyp)v(+owYK=l>-)CJI4{(i5_m3$bTKaJ_9$T$4~E*ayu}dt*3*gBe$T zyVzsiE%(7_eQf6@Y2 z97ZON4u4w*s(X?N8B!6tBBrH?l%lt-K_V)&NsjHJWv;y78DO%tPRblPD6!f4z;IS} z!BYHxHbtUP8J=J>qEq;*vih}9HxQp)c`cti^&)Azkv)D{GI~y1?k*7Q3?ut)zgF6$wcYhFbp|~cN-8wwFMP&rJD;Jf;D)Fp0oU%qg)*5?{JQ4PDD(G)s2KAG4!b^Q_tE)!Ldu@9@!Qv zn5pq>aTxJJn>A+!?Yehyi0c)S|E_&$eabc=X@n>G8nHRhv`c>T^OQF{8u45noFR$q zTY0bFm3gDqjc~;;f4` zI9u^ZuW8kw%udj>>Af#ysYl;Z*5bli@RpmXJW^J;X27aj12se9C`-Oz789C`HMGFFVH5?g&R)#tCs@V-H zZV7CDJ}CJov0D2|?cXBze|LkR`J4>qHhB|Fuc=w;>q}{ZsaN2xR}k6hj;?;CLvk2e zoPcpPeso(KU#-@XZQDHGU;pLWo!LUOt1INi_D@US*o`Fu)6@9gL}l@jvSw{=YZ2F^ z*WSoP`}7F(NnhV6bu2?Q5;JkBU4&WkH=>0TANaBb9Sg7MbO`e)k9yEDAO%nCuo0gW z)FH%SW(op}S8xhfDVCTgDQ^8(7DZPcuCWOHS78l>_gO|16m#lqjcC90Ar}xsmyOc9 ze^0RvUcOR$rz6`%y8@M%B+OT^{{8LKcKPh7cVzii)vSs|)Tp?ycVQ>ECe1?}{!r)D zCXyH!>OzKV&ACCe!1@Ia_bK8(%X}@$w_4Wn^lL-~Xtb{I>F--~mIq}J zVu&@0wO*1ija6#j?US&e4$3U+WvSi4W9WA4dX#jT(3R|kfVPlJYz^;Va&|&P2Vdhe zE7+{d7D7Ms`I27i`EAHG*}_|sr(i%m&3&0`HihFUqdq*Y&eFXd9?ObP4NvD=V_veQ zy5VOlFJVl`6b()BBu11j&K7BqDD)MS{<1i%`&ij%{?H-oV(GD*OQn9Q#0!eTrGk2P zw1u)SA5#%;_45mdkc-oyyiqZr>k)oqyczSiO?ay`S&(#K#daERYL=>ITi#~8)b0=% z_%@mJU`>%+>&lS893&g1b)QJH_9Zb{WMO9$;(Iky5gqZk0puAz`*W4)$Toj>M|0P7 zXHX#B=Nv8U;Zmh{%q^V`ZOrwye9JNV6t5B_32!d@zT%-CK5z9(zYlr2GO}djhgIT) zvJL1Y+I&oB?$}-O&eoxZc9y^8P1{y87&}#IPdK2xM{e zP~2aNx9)#BeY%5av-w3Wn*j%a%R?I@_TER^YVL8L`N1#2N3)S zvKGme7t<7NBR#us`Ky%acc&alYW@3Z67L4J`NHl+C`Vp!crJeCrA%q!5zrPGJhyi* zekxY&<(SXWf#WyN-xU+q3VIJxg2*9ork{8qLi%u9+qD9FHBWEK1x&zBn;JuWd-a0W z&dMr|(@jFQgkJZ$&BnIq)mM-GT0h>I3chAh?(iHeNZ!v z`Obpm#IyaT*@>cI-urnBF@TU=GeDwif;O^!O@Gg9^pW$^QEGwW&7i?P)9vTCwzz?3 z^mCV7pqH`muX%R0z?vf(n-oxOau}8)31u3iYk7Fn$Qyn!2>sfds}J-C>70v^zk5SQ z^n+Vsz(Zi?-zr@1;#s*H_qgOv2tL>@0?^!YOPXhqxPrb@??&~70+wAkQ zPeCwQy|L%fS7QKzxW%iprY4ALt-D)9c8~(~f$Nn{7fZZuuLU*u;3KURrQZo`1IW;LfrXaT?EF@`s(> zFWp>X6NLDbS$y_k^Hsq}nF zM!InRe-uZCFM4b(1|_;MdJcMjyOh)tnbbHqxWzDO{@SLinX=C>Pb~XeIiY}}@yEP2kf?bsaJtj)Gi zl$>2>=%j(}qiU5|04>H?yPQp~xlJpCE6v^O=GV3(`uUh$05@%B+2(-W1W(mQ_X$B` zXc`=VW9V6_`ih^Zo7vio&li$vUE3x+F^W(**^g#(JpN@A_P?K=m&T+almg4VE%qc* zRmK)4(RVyv5L?KasdOE5xs4>btyf`t#FAw~4Zpnl2We~-7p{zuPUQoffaP16@RM z|3)d^U*_}QYhclp5t*#Ge|Lvt_{NWuEPaMhgF^JdodbeG=1CoB6%izFvx8eS9Sy&Z zKOg?Tpw8d=4B653F=$u6^C+Vi>BN50KzjYETyJ-TfG*GWZ%tHs5;~(cJ<7su3G+Cp zDiAA<5LRu~GfftKCeZskBF7h#X(BKyorK!!4%ZKjyQ}r}5^&{rV2C8H79st=xYfU~ zaBVXgxB@-9m?nu6FZ?5uOolEz)d&6e9W9gA*S84b_z_qO!MXkeBmcbV`@5|!n(<%! 
zit{pTIguRrPyYL_kPkQ^5Rst~+E+#raOb{)(L8E|RRJ{agU`7aC;pZNSHu1|N8a8$ z*+Ps~gg=dZGlc+Sety6th^tm!IsKn67h_XBR_AjTafx?ddc9qNNgMdOVA|L7+uaYrJ8g%N#(Eru`*?_b zJIq@d>a{)XCp|JwU?;8Bj0G!J4clKUR`kIyg263~-}yws?`Uis-XTsLeJ6s>XIl4* z?R@bAE#y}@XQogtUTUGsnWB3kQ#K#EYuUe zoCU2{7UWz=dmTHNH^TnF(whGZmZs#s-)UzaNTFu8kXz#oAav=k@`cg;opqWV&n$n#PiP6kj-%OD3n(vL9FPzQ*2N-rvr^uo}U)Lc454tBHWEAFr(39HVsNQ>pq&+ zJVfc4r#QRPk5-p!{^bG zVNlHf#|ix@9;c(^LM{%H=I1LDhyKYHB6+wyWAkgY%>36Eu}g(9n<;K`wKUWgWIz~Q zjgc9nyZ=_3614Ql`>=b>cV6r0J{dXkd&73O{$M3F%vad;(>r)IpWx!s-8WQiHUdX= zIty+M<|!b2CKe+AV{)1sCWKG#sLgjbxBw2^j{dpjIpcO=dl(E~|KAV8wC17QJ{`rE zkD$sN%&H}OwquJtGrDu@klMpbYvnIPPta@s#D~5>dL`XepZNR*0_1h$0IygHR%t?| zzQmT}F$Q_YYsl(i+};}zTEf+Van&(k#-e{-OSU2LEhi1K9aALoK)hav11yh~V4`k~ z748rRWeo6H)bn!_@O3j)9t=td%vX>sZ%qeXi;$pV6nYLK_x`*d~oQ`ZzE5i+sn^2`onM| zV8ZQ3-8C1QZL$koA+(eo!9ohwZkJg@k)|{E12UJC5z`GPm3sO~q@_`}Kh4~!UcDA+ z6k{CKG6b@mhYuq+Af@8P07QzGl{V@F%BWBtUT{b5I8;X1Mx3}m@(6H-ORFe#-Wmk) zDi5_}I;(Bt}Rv%l~e(3oqTIA)!X7}$jJWGgz-)iDTo6rb1yS@|GBtkBjFJIa; z3G*ntcdzyiqYu>F$`D~&rTsRK22^{CRt>j_KRorEWiE3B1;qSG&D>s(i-Y`6=zM@a z_TJi*dZ#6f%a@bMNIwj_Wnnp)=<$n2^t=}K%L1WCUj61iswiFQkr>4BG?MjE(Az~_ zqjmvq7yuLwo(g-46!?I=U{1!!Z%RzrRPUI~&*wVA?ilL<%G(79?9C`{8<-}pR@ePG zLFj7JCOBXvo<&s_Pt`TrKq9@UrfJTKdn;H{f1Yi5*&|})Q-DV~ zzM1o|3~`Go%bDjw#^{e~TUTR3&+h@0J0S^4*FNnedj9wU8BPtf&b_xp`GDsrtC)O5 z*zM@u{Ql*Ux488AK{wnE9bmf5dt(Rhk7jXw9Mc^*Lb?{q)|iQ|wf1U=xddrnLi9 ztMeFVX4VUxf**kg*XZ0>l&@oT@VS;-XnRlZX-FgnsJ4aX-y=(S6{$Sa%aR+)dsuNig;6L5tgb2BEP_Z zYB=)kVNZYl^M^g{i$&S9X&ep~H0E`urQZo@IW-cx{sMUHZxKi3Q^+WvfMcLSmmkp!i~8q`!^(M;1

$$o_U(0>J&3c!ZRVb=?kkbNwcd5Mb8AQslJO&@x_C(26|!X~elDRhJCz0czxi$VSWC__+um=MP>p+H0@_hqn2`6wfNqQYIw0)&K=yF#JWv7Q8pqz zfYF~}JJ_2zs?J^sz4k^G@pXmTFfQeVTj#lC;+_%Dwxh1+OfXT=S@hSl?&G)V#}1tL z?)S+&c(WmxRbsX|D#F_~rEa7&%F*|s!G6$g9}K!e(O$&ryjoQv0bOeK=G=6d=d+X4 z_Lj1Oo`8{@?lcq1zx3$p^(UK*o}bINPe!?U)&AolHE3F#?UV15t;S15Xr5nV$34a4 zqPZieT{2=~Z*hzBu~EN|_#WB0BuJ+j3>dY~Md?cT*cD`m<9D?$-mP6I%I>igFNiL_ zoin*#$AFg5I-1$|;HoX`k=dLVsf=;W#@HVhIW%be)ml!=`x#y7d>(2~?ljzb)CU{z zK(8jv`@Zw8RFEblH*;4nKhot^!#%ys>>jGVQrs*%ZxMO_rIabl21{wmmHoUY_;Ign zH)VUN?k@mXMJdX|LAu|ZW1_;_Tc@^=^;0YC8*A9>vyb$q<@D{{# zygn)M)8%jJ2(!r7p~kJi0wDc zhiq4Be~M&&za=+Glkrj`FI%!mVf}ZY*8B6!BEPAJCxCFefLb~?TW7driF5~iVt14p zzPqV!wf35Yn&hbC^8Hf&9()va{MNCfbaSl}ybCW?SaVTamakPJOQez)d|NXgI$^OJ zho29d6a_;qv2$J^Kc?EgL6NxLwfWtZb6hXX6)Z0efEy)uxcCBB7b_hnR~P%1Vb5to z()1~=5y8f$axgPr+yQU>Ys*(73&SPECKdG4R{aZ|Le(>_&Q>0M*Yhmq8g*q;&^NMw zjpP)B|1i<3xAs(I9^`XDGT*D>hizseC+^o4hIhY7^gK2idiR9!Akj_>5T zlxRX}(;MwiDe>}E=ExQY)cEu`mnYjrW}Ym%bFJPrzyk7LERV*wKjKj|)$XW!sIBBJ zvQuo0r+kmml#2J1oQ0r~kc2uC>mF~vo9qGs7wnp(f-c~VZ?Dgo*S?hY)QB-WuaAFX z#r|%Op(ku;O1)BqyopR#rc@X}0^+{=Qu3&W*d;SJg1oP6y6e!TO|DOiik!^;hP zVEF~VlJ1CD@h_*5_Qm4tUYwpJnOd$l^<~}=$g?1PYuaB%o}d$l-l~r1k?m-n7(nTS zm!_y5Gz$_5=H7_0RIe>SJ)xE-s|by!E-1eBVQ1}8fgC@O-ivl(C7Uk=P2~jp$hlqe z&8WIxOujcrY|~!Jv_|YXpqmSwHk*&97>0lCH;#zD&tEy$-CHSB96b)S6%}u))(A%t zcPMq4Q1TKZdkyvJ^GCFftrtXTHoFZIvZbjno*B4a)rpRKb(Jd^5c5t~J^IFjhn0`N zIg(6BkVNb%$vA!qn)j1>@neWCqM@H}GBWPAEIprLnjM)_uMLJ?sHmu;MXh7^er;G% zrhTkfF?e$67KB$RIj1t6}0l8SfOUJa(Dnk106am~H`t9+VTXy#>0L_r)VA zTt78f(+r3iJ-J450RWl}VUAE>!T*$Vqn1na71QZVCx2oKj!*6Fe(EfApAIV|e?gY3 zOOgA~YiI(au2q%ZSMTSzHxwnhH;o-ED(EA8;jC7>&h=>+_NssptZn9Q3J%7V7$a(c zS@FEYKNu*}h8cWn?oM|`vZ9wk&c zyB1R*n@Tyy;kYHrjqC6X*`ndW;T<`!Fs1Ah?u#+8PW^cCaxEA0ow0RiD{07}E|D@g zhu&301dr9IbOWPegH*<6ZSfS31yXF0SEL8~yz_g}dh3}cOPX8*MuI&&Ny>+z(;VXJ zY?B$!m!m)M5k@7JNVbIT4}U5QiX-|m^wFFxCuoBYDB34g^+Jva-SJIWH{?y$tq%^8 z&?20Adymgg^;d$8bpXaS3V+WZT4?LefXDPGe^z~u+ePkM={ov45=(LgD?l_sT1tM) zt6w8Rs+Vw=5qd2#m)b<@8=GS7g1UHt4sPy1u}aNw%vfRMmoKMrbD!LB?ik2F0O)>f zH^3PC9fVoq$-AeLgPwzdhWK`N@4Z<8+=3Tk-iFO@s7M zD{^1{_Zp!8GEC_cQWQLykdUit@ZgB?s5s(#sv@CL;V!bdQ$0%4f~?`wr}1yd z{FbQ_`GGjGdY{TvzLs~*p|EDf;-yVgZ@wjY=$>fVuDiLRwk)MAM~ZMJcf{l!XBDkJ zTEB5UUGt3InjH}1r2{il@JDS9_y2&w7%6_iQ;?Tys+w*YJCREtF|Q@f5%u}j=ilH8 zT3eevg}xe!;9AW0aCD5=C|3$mX58)O7(X>!Vq$y(Uii&TW3y#5A1BMj4_o|-a;1|X zb}L%F-rQ4ZRcDS@j19h$CE~e~*2K@!n_-{H_Ytuq5l<@dmazyma_O+z4F_I6#MSn+@ERU?zQBjRGA5sFZ8|I6Dq2G0sGuG_Fxs@G zaek45{7YjxJt}e7bT^q=dSd3{*(=DabhCH^2J)JCPt;9z>(tk&5hocZDN-BC4oUNt zPaT9g=Urg@(*`DAL~pF+L^5v%XA0pyzHVVRoLiLAmFB2vuBMkM_*?s5I;Lo{bxcEyifDBW>b$Q`K(we70tT$d8V`n=|E~wwxZTZ;C;phditB{Ns$L1`Luh~ z24)4=btT?iMI~yFPTm!nlatkuxY8%|YxZVlOH*-X%2-5?gj=23(0ERm`t@USY2Pv@ zLF6eI;7NMSiez;ZxU5(wzx2OxDA9B88)5kN;c7-aT6VtW-u>T2DRMD<;)~c}gvmJ{JUqXvK%HorPrFq)jO%h(Q(I@V6rb;2JVKILrw6n9 z=Ty2&RMCZYk5V=*0SlR>fJvmBjnEyAsVa_qRsA#VO7pYNA1fDjVefj&z`Xo(tNO98 z@Y=l>L`YuXSqZ<=hxbPrIK=x0EEZTcB6oPi>4k}Ig1o1AdJwT6bq`hR7^QG4d+1ip z_Hwl)67InoX|aMQZBeS37L5{&uitdd42ni)s^xs2lhtb(u`jmArqR=nqzM1~q0RyZ zL>BE~hcGU^UkK|`n8od8OY`051rQ|{Ji^xM_u5IruX6p&xCihjSVhe zP=7LGRFzR=uJ!A(4`SknPi)DlcS%^gls~~%CuAp1883x@TyY=&gct>jGlSI@1wxa# ztKO^|bM*wf@>~lfxf2(#FuawSuye5NQ^P!B+yPNXw#wJ92|D%k6Fhh7X}>$@T^PCe zhF#v6B8f{=m1oxQZgFv&Re5t`Xb#MUW3UaP0&)a1&IwFK9e20Xp+mKz^m);Mu*p4EaPyWV3V zyX4!qGD5dLwK?y_{Lk+lr!*i#{*nn5T4JZiPKV}Zh!U~;!bYHOm`9`cQoimPmT?PS znle}2>PdAzLrr@Xm4;+PN>1^}J9bDSPTYH%>3h4zgbo3vh_SPZ^!xmJwy+XL@(Vf( z``%Q`&IM019490p=<%?@v#A|yaqJ19Ma#_jx3u=1+*!nFrZ!z)3j1w}f(7ORDN#Lc zUJ41ID!yphpA}^4+#KxAJ+u|5d}&;rL`Ye$t2HCd8(9e*N&*Td&buotJJhM 
z`?)^W#1%&vcPfh)&|9-cJ{(Q_=RvZ=vrdWAIW~`8U-kI|F6n$ReL=KILHx&#D~Vkp z7Mhn25eT|Gu2vN?w=q9|-ch&G&bw5hT*v6Syn9}CRkHVmKmlv%&rh_>QzJ(ahRoBy zxTxzMn+XoolT(MomSIvzE|>vGxMEK!o|ftN<-1mZkMh1rt@x3Yf=V&x-m}ro$5y6W!~HSeT})2TzNTBLJnLD<{D*IX zTs#@yll^DUT+~2>OZ-6qCWjlOyqqg0xT{=iIbtUvtB-?r)g{8QIKzt4MF+?sfcF@7 z(yxzLupU9tzb63iJAO_m47a-NyPdT*!FEdWFNh0v@c#0&@L`R}_Ecs&NwD&@e<~s4 zGjsiah<VtB4+Xzon7|JR!XsG0c|D0@_+tk- zi@{B_JyECl4j}><`0L=3#Kky2o2hVZ^PC?IGOC_Kh=6pX`~G9Oo{*3Gan-()@DI)#X)49eFKbkKp8N2{I>oCAyZsDGuv+-! z+>jHu+Zk?ss)smC7??%-Sx~QGqgfX6a1%;xmA7UFFyI?)>(*mnvzQsLJ&KmCb2ynh2DC)Mkv!p>{-FuU||LG&xcXK?ogLjS>$Ss>UH zDg&#^rS`GP(j%X(c07fMpXC$k5?5|Kstb{2Fltmn5irxB(YTNBf~P462p^#ao2N-B zIb`utF)XjbjSeNI_3dzI1WF$?vSC7=Vf5*QD6*fq#*hUBv#62j<5q+OyYXNGWY(8^ zz9ek4ZnU;1#3Yub?Jay5`|)|nqvN1GX2iqWqe7f>mT}VhiU!GN3j(XU%dA{|WXtJ# zZx)s)R}{;!HshfD-vRw|*#igqBI$CKeKpEo+-V}?CzhDKxd>hD7x6kM%L$Xf&KAGV9mpWS(^6*3{hxw@zAsc)G+@4+-*{DFslB!jZ%C7bfaqgDDN zgH=i|#mbS_d}3HD3DEeMjFmg5RsiNTixt{bDTj6xbb^OvoYCC2N)2InYTwy!4N z-aqV-K6qIM=g`SI?wky@)onOF1K<$b8Kt)^$6FP}Fqt|~vO$EBN0;K9nKIkGx}s|P z^6Ol^yI|{S*;-3?jH-bwTG5pM`lU5J9n#hN43px1rQ5WU?u8oFo^EDhsKjNuMk$O8 zgEG4Cpi1R1%4)+r>|~0cZ?x47RdIABV=Z1Sjk>Er2*t{ylGa#YxB3VX4M!xVnY*%E zi?}SZn2cv@?jpc9?*OS|B|=thJu2zAk7UtvE9f8|G_zeFgCyGPuD|GA0@% zTHP@FuUkvD_g%x+{X+CS<_4Eq7RL5+Jmk!NaliJkTQPaV?Exf_g)(~g{9M}D%oOmp zCLvwnCXhmAHp@+4PY$h9KfgRaI5haZirV)VM7#^H%?D3-0i?MS=PQT-=*I z!YD1|<^=p?GJM%>*I&&)&X=iuh_bicVDa|)O}(Xo>e`0(_wM%b>m`Pw#*6FkGOuK{ z_ADCWqJ!UBLL~!{Df@Og;D&gdCNHrgt<}Wbh!bjSh4rsld%6U3l7>EX>Cg4}NK`H~o<;Bi$NuG*V`@Rv|s#btM@l%2; zpY=>8-4;1+99u^H$}wgw!9Rb-v$Hc!x=mwRVGx|iJ00pm*`7s-ph8&d(w5P${pKrz z^Kz+iV{mz-7Y1^LK><(GQJaY$W_Y|E>ASnT-!L^bRglf0ujbQSW1J9GDs8lM2~Om(X9w{e~ymj)rj+tedn9Qr;*EiyegyQZE*0WP1myI z{Y|ul@_4-(MnEPxH@9Qfp)3#o`Xh~4C7G{ygt+g@ywi&N$r!w~zQ*;xU|fMs<23^8 zLxSd*ouo?8g(9STR5*dlvlD2nTy1`x<@aOX=<`|@5LbgP)WNBIr0#z~bDPPxck)K4 ztCH{bX20OsZ!-F;(@iGz0R&U$BE;YEJPi%H`G+o8RM+c?&*H*YKl3g2REVn@ zmQ@7hgoO32pek(oOW#kpAKG&qG$~LRhI^BC%^s-6+~2(QwbY>N>ce@=gp4E^!=KrU z;#DeC4EH2mAtNT>4EB{q3!GLx>{NQniedP%YE0=NyKmC9@MGZ;N_W>&@hMa+wCLtw zr|pb;Y8EglD%IpqF3BXfA)m6o8a4c_qN1YhXJ2Mp2q|;Dl!Yt4fcuHdi4Sy8Q*7<*@)?*mqmom4Bk?IOLBO#)e15tt{Uq=g^ z5>o!@uJobnlX_g$^c8evRKUe>s0hvfvnAL>jZe=TPLQs6Zk!T=*HQzFi_v&|J=ZWY zwb%asS}EodkKXRkb+d&a%R=S1&pd+gDxG6VUV0Sh8(qhc`W5CU6+CC9-HEGzYSxzo zW?zG2M-QUz0wPYlw57ktb`S!oan@dRTs*E;<{8~VaSx^ zm*w>JJ&mGXULCmX@-`CWyRdY=A=Ki*-!3TOe?9DX z9yxM%3-FM9g_7izn{?Kd^61J>=Wd@s-^F&i;Hu&ZK6ocojd3NN&;JnQOCf_@=`AIa zV}>hTb&2tmq8~Ft$T+wmRd_3FIL@{!K7jEflT&KKu7u%G`5GOhz#gjmaaCri1)GP$ zpmZ;KW(aDaguLHoGfc{x=l#+8-9onG^rb9zIUHQRb2+`IaF;Q5xGU|f)`A$^-0W@h zN)?nE&gI%na)z{`NEqXEQjF^@>MTTor4aRJCbhWd?i5*qZfRg&B>5`F^}5Uq&Kzs= ztF!Ril3g>&qCqb3_4Ul1ZXKGMu3!K8b_NY(1GC;@Td4XrknJ}5-Y zW^G`dpL%ZG68m6Ac=!}b_Z9vEGEy&CWu#tlLwboNvNipmc|SHbcGVli*pdZs)K(xX zYlhKkQZ49^wAR&qHr20=bI|IvOa9hgN5TQgWt-vr+SbLPf;PyXF@y3g9(!VyCLCXS zU?4|_=9CK|6|cZkiM0?n2>OwggB%g#0`RErnmeJ?;Q4SOIgwc^f@Pq0@Qw6)Lp_6w z%Xh->Iu#=Wn&pSyN2BYQw{PDv@6W1lVV=qS+}MEwE{))eN<%;0jJen}3fg&!r)hJ( zPjr}2@>C}L^5)w`hv2Y^0S8Hgle2RZb(Ar#ZSylm6HxF4E!bO`{dwJ(koo-BS0TMS zF_ErkXy{x&FFC%ap1`LQQ109cJl=p~;n0=b$NXA%BSV(OmkRF~d>aEbqAi@6yL_&U z*uCqGJRTP!_i}`0n6kL16oQGqW|F;y$(3Q|6opr5&=3?eK0Y35B?3j`Bt8} zRN$;d@9`P}R+ZW5=va_w5RragSh!TC&AXW2QL@1ZS$AJRc-Ey2THchYw^ze2BJlRq z*8Gp2v^YRSq$Gq~!dB4}f3pA<@2`W%ZDAqVzX{`dI!}UQ7cBGCZn{%&sH8rN3v#g8 zC7(mjbvX7hsGo5Xwqfl1*T~ir;u}>nP*i>;j^X_UI_yfJw|E|iy;eez4Za^OQC4-) zJ!~%dvj3C)E76)?2G{gFR|7T598%pj`_#i*lg~koI0P-l(iE%S8lX)cFcq)wN6yELP7lOFC zQG;2jPl)|p8goal?m;ftYp{fi3Cw^ah^zIye63t4-Q_0=sVfGPdSp)C=-818FoI|%MM=#!|yk6&+v&OV0|5Z)P 
zXQ^o5lry(hahBaUm2(OkWwbb$50%$t_igss{i3$R!+?R-xe9xPcoVu^?005f1I0&N1yOeLq*;TkXO-+Ys zAxBGo&w8(J>13c7;yRR#nd~2`Km7g$dQ=ULb&{hMk}kOnEp~6`CR=SBrb$HIYX7_r znh{uYIarffYw}zy3kHU>e?Ug#%aS|_6fss`#u`VLIL%@zjup4DoQLyOi*;V=SH|G; zx3RT|4fjpMHCpGpQ?D^r4m5I3&qwU$w$!<*TjpV2>2f$gyFo?!a>#+{{IGoge0ze{ zT{hQ<1j+bZ`P#5zdpIC~qFs6B)M_S|_sjz@vFfnt|$MQs7I zLKem_^1x4q60@S(WmJ_FXxp#*fKk^6)`}Yp-4v)v@Dg39 zr{Wj24gPfe5RBqd$_jC8omF{#vCa;d(po_)W+L1M7#%Cj@SYmTaR)}@cu?R@yPIuV$*tEYYw*y*qFaJbu) zJHuoQ0^nuc&~(Apd@>82C@uZEro_$tv?+`#PNtbi*k5{Q$>z8tf)STT-VFJx1<$$t z`pp9i(YTnxmpDgt=ImZR!|l^VY$^0!8iAD?b-Z2YTv7|x%PT4f3ydCd*|#lF>AI$5 zOYwDN7t90q8y0ww$_3nt%OU8?nx(<#FCPr&Af(pADPlIGWM244TFp%8I}`d<%lhc3 z`@y?(HT~`1*)OfU_M-|v*^g`(g)jbWxY--VmxjBWqtCgv{CwMc8rK%HLVw;?yGu>( zqlo`?-{|&2tt$5-0$nGAPfzLfRz8PPaJ4o+XC@qhAmh7TtMz`kYLDh;VkV0R^Ljf| zu^pj^N7pEI8R@j1R^Pfc`-4_OsLef)ToP{nqhy}f$=qPd$`)feA zi@v+Y3WZFBkN1+**yn#Hk}!ljr8>8WG(gWJaIJTq33`D~W z*ib^VHLvBDb*QEe~BIJ;v4isp9TQzav&?uTq_X4_2+%3x=Ow(b5RpXmf*=a%$%)cTRo=D8rf&wf4};%nGC zP`y>O(pfa1Yl#@S?R46GR_acM_iB`b9e`%aaihGvycMuvvn%?AU%^ylJG-eh2Lnls>#j6GQPZcQ`1EXy-rw(!f{2q&bVmip&n0juDv-!Gf(DD0T&dWR%Euu?&ej8-XAAH$MS`X*+4vn81CP7lmC6~p)H(p)DF;i?E zF`3T)c#$uHRGuo-wJ4F#HhXDtk(<*SIUcm?K+RzPa|lxJLfZ|Qb>J3rF^*)|TZyQ{}ZXJjF77W13z zZ`p^_GV1AxxubZr-q*FhA2~bf5=+upULU99Z`$408&r8&Xm_i15~Z2mkn>gR0g^}N z$={Hk)h@%*ngr%bxNOh5msoY=tx&t(|#Qy4+yjN%6E@kIX z&-k5lt<#G*wlYw&CND{gXe9Uzq$f@I9C;Nu%qbRF^l@HtUC}Q9Ggu4*sW8;2Z-*X) z@!)42La{FBuou7YdAKt_Rj@d#T(rh?D&uXGyvt+mXjkWxp6Ask&eCr9n&>kDC!gGBI?UIT295VxZOTh9Rhn;Dv&i_>Q1$3U3j z@!yUC*ZP0{HklgJeBua@8nan>dhiN;n1V2bT6$=MYQV{0N6Fh!yZ%}DK5|45w5xxB z-tX&vR95w_7(MWfAkMVDJ2>qF)q~!a8f=gEKDk{W>KE8JA(ce4(mC}D4mWI>{LFja$3VPj+^x4)V>Aj(NNbdh zlftg@`?Nw=#D~XS2#p?#w3%BZpO;llJx$VdjfbOaF6BW+A}B zdiUG^B9QMTwgOUjV3x&pygEiLtf_GL7}YK)66&F!ajMKd`wgJ%vQB`XAohNxYkmyB zf$aSkTD6uqS4oj|;v7=RH>Z0HcUz+?i}llPVG0wGi5m0!4`3z4#N?k#hziL~tjp1a z=PXiv^JE#m&GR>Q3vf1b*5tWm=dVfoZ7&qQ-O-Z3;Grvx9{6c8PSV1B94~vvgU{$& zh$kK{^cZJ#gyq@Y9yNGR5KI)tCI|u73WqXz3I*2WYkMe-2MPVekje9Nku&Yy|Kshe z!>aDKZUIrcTSBB$l$4YgDM19KK~lO^q$QP*RslglKu{^^P(VTir5i*fB@~cS1PSk4 z8+^|>-#PcY-+i9@-2dLkbJ+X$TYIg!<{V?pF^>mgT{=t}R>Lj7UuN3dQO@u~znx7^lfqgy)h2ZC-{eM5ou27*Czt_Q0y8&x_LCPOxE6zy*9`|wYy&g%OY$P)@}uiaUO%ThZB);*8ZS6avT^r zo$&#(4-HpWA$iQLl71T<3c;p6H*E|G%5*Dm$nC);Y;*zM#D(Z+38{@sC@FJpmzMzI zztbPor-(VWS6!VZU!mS6?}G~=6@5gCsw3RzrV)oB?vLXmV6X;mM;un1F&Hjr2~De0 zg4)EGL|7SzU;l+DGcz-!kM1XKqE5d!3$%#QFBIzg=Ea=gYiig52<5~XLm430K2I~N z69>yP1%3PWP3wEObh8}(CSpp2_b#M}5@2_EKC{@byG~PdCz46t=)Bj6@0E5MM+9Et z6X5Vg96_UCcF>Wlo7WD8F)tY925p1u0nljlHwA=1c}%$tA)A@2(pLe|mC@+jzqg#( z{;+(5GspgVEd-A^?%n($ERarR9~ojIu&XvV#9jv1G;t{X^RBq^k_HV@k~g@(Gb=@d zZ$Th~x02%UN`n059a2W|CZI`+wkjZY!N zNF$UES>g=c=ZEiG0Ownavd6|TMd;}YPD%&`x-BJ7CuKUblXLrrpBh#ED4x_^bSOw| zRoZ*a3b3J_BCfrlDXjaMDA))q!z0F@`dOA;4acP6!V_{kSvW}Vg@-nx?gZh=Nb@)O zbE(IU9M?K8pSI3c+1@O>oK3sh*<7$I@FGB2V zM0N%oYGW(ix@q=q{O*sq3NP}b_+Iauam%5n7Fds(tGDSwb*NSpzlY1Yp2wv!1PRQ; zI~4&iDkA+r^IRKHpPb8;^ttEsdm7?It3N==1pz+AyTQwZhQYy!D z-zemd&`R+|u)++n54uxYzYeqcJ4TjYa`C9X#57o4cXvExiF^ET%_UNNFY&2;GSCR; z^Vx!#C0|Dbuy*}?-YADJ8iAOeb0*JH=y4KoWq2L#Euj9h02t;+XLG0jVUs@Pie)?B zVu^TFnPCbC9baAvTgA&Vi#ENSlBofN3XFkKal=6HYvjJoVEGHf(OihAdti3B$ zS|R{a-8p2q(Jd7H{qbgjz%0qPMm*F=K&Ma?M|8h1XS~d+nZxp1 z?QoOp%Q$_kg`!JjEk1r+bBZmoMuX*V3!@OHdOcn9A_g$BI|8hahRCl^xl(1T12%=5 z@ks`_0SqN5-FbI3$U&%>$NIQXIX>m*-cMq#L%uHAcn$iFRcRG|E-JrLD&z+G9{&|` z>%sE1$l^hu;A+O66DxZm~7+C81^M2@8 zid9OnxYpxZw;`TfcyyWqyO2f-BytaIib8=m`+kO*jpC0a}&!ExGO zK=PF(>{CU?Z*Dg`aOo%0w{z)JaxcUAUc;@zm*HLl1wP8(Kw=*+U(P~ zM(Xl5lL=!fodsnDKd=i1G(k2Hk_4iJf!O;Zs0gtN4#dbZj)vf4Yd#q&0)ZEKw^wWc 
[... base85-encoded binary image data omitted ...]

literal 0
HcmV?d00001

diff --git a/docs/source/assets/kernel/q_vecs.png b/docs/source/assets/kernel/q_vecs.png
new file mode 100644
index 0000000000000000000000000000000000000000..f55b3742f3c6a862883c6a966e0c2ba0dfad67f6
GIT binary patch
literal 42065
[... base85-encoded binary image data omitted ...]
z`Xi*Jp)Q0-mJ5QV5K+H^_fw)sCLIw^;~AC9OJWpTcN`p4Um>X%Jbwy13TlqW`0e&o zlt8J+`Yvx?hGfe$eYJT%2LQZbG?gX8QB}AOuuI@sl7Fps<^q0>56;iY%Yb#;YA8)b zI$F}`_oYbG%!am&jMl(a7{2b_fNpI+prw1QxU}GPz`avit@Iu@MB2t6kBKM%AZFiEC6aSlBZm@xmUs@Is z{&Fguzb3@ty_5n)4F8qfh7Y|9m`=KT+=qmbKpk>U&<|ZCzzi5`WbeUQHfPY3fQ57v zCGJvd37X2IDtn*?&DjMsf2NIqtOKgHtUxJRKD=^_aQ;oqBQWng|A(Y-9aRG7YshiN zhYz3F&Tv%&ZfsrsLH_^X2oM59@ZB%>>iUuHvEYusEE$Kr%TVRPy4fj%syF)NSm(Vd-ihCZ?h0X1%!7|nU_b7ZfX6iP~_wh zoS|J)A(i^CuLE5|2)Mz6(-3`t=e)1+4s7YUj~HMKl<+YWRwxXs@pA$9`>47NTkySv z!5jDv`)>n@Y9UY-GSbws%HPBf3C{h72N#4dTo9dcaZc3KAsry?YRcApzCtdai;jN$ zr#u9ZL0*P<++ta1K)#^XME39Lali{)-b6k}dAPL0o;E|Ar;d9E)bg-^X(jBtPr8tVg4$7&TNuw9gGEf~ov3hc*~&y?If1D_KOXON9f zfm7;zsocHZfI|3Doq1iV4Z_x3(Ym=+87d#s<0MP~rIa~}?I-;%XBVH(pm%CIeN4}$ zzRvX8{~}pOy7Jv*rpD?AoISe9@Y?5xv*RiEwe+6C2s)*R^=lp)PP_wj8|kz}v9jy+ zI=kP~aQ5bUczmBj&jO+&A9xI3+Aec3R&crBI0(!;?Fa^Ibas|NqMnmn^^a5=X;?O2 ze!5Nh7Z-W>HXq^y+gw;uq1A_aY3D#w*VYVPpjkA*b(xPtqlyAwyKQ(H0F)I5w}2uK zo6}LvSx_Z-kCq@%j5^Za)%Y2VIo;KdchTEf$7$uE3KQ5w*$ufC-(Jdy?SbgF!sl1G zVFa{r?o8v557ZN+`UDuv+BNQ}jTV0ZtmhYtLl_Da;8pHMWHUHP{5(O%cIf*0barr1 zrx74Go1OATyHW2Rn{?z_@Mi^Eg`ZKn=x++k{OrtHJJHk!q-CylDbk_fb}hUBz}(`x zwp5|&000qgpC~}}N6I#XY6fT-jMDpO>&#T8N$zJxS85N|;Ksery5FbR(p4KlFEhnK z&)_>-fF8h;M^8tHL&aqM=IyOx*Pi`0${G(_FtIKlOEO_2@=8kLPvdAiB1__<)1h1P zbbun5ZR6VG_pX*>?T&~6G{7T;Pd&cqHATZn26Q|p>Dx?;Ez!&A!9KtOXIjLzR^41L z-p=cRqT5!lY}qocvJ`yw=JOnFvjh!zfy90J3EaXIoSxOe2^E9Ok;M-_r_)$lx>B2U z%Ycj6v6OBDOsRGA<@~{CR_9dAYKuW0IHnA$TXbK=a{i7dlRT;<$m$MF?<%WXn9&1) z{vI7Y7JM~7gVMH|$gnpkbhsmnaHzI9AB=RseRs<09RQ2(;a?n#xEa-w0;{8Qtvx>R zCPK4zJgV<$gN(n}pAyjYE7R)Slm7C?{ZzvM1G&}CXxyICHUosYQ9VBiG6BQ&BSiEno9ez$(w!Ah zPtv!>hjIV>bk;r%WH_PA6M!Hzzu8;7DYMy_Akuwp1iEbqJ`9EHZcGBO?wU?(kC6hn zuIE8!yf05Ko(9j+k}$h424o_NNWo^-*cpziD@>4VYhrb@+r zr}97b?bR((VjRvc-b}m(_3XCyT;YKU)$MtwDh%ZKyLM|vm25X`6NlLLOhs-l1`8P~ z6|7Std1ktp^eLA)GcO`Mwm%jIO4)w>)iHeYs5On^*-U=ZZ+cxEJB}#DzglLlva5>?!s;g{<-SvO)je$~%gvITkMr42P=e-_zgc=bwtaO2(lBT{dY~dD;sc53^_Ftv+KVCC zE?_(+=lpK3JN7Sd`z+4BzneLZwD+cUGPnHlD1ZJ{5LV#U{YQ-53~8^86Ae7KEG1oS zr+mPq3I^7OI_4ql(XTw>K*g`ybA2@h+i8b3tbXXt4ph3=vI*4k3^ObPkffxaC|@n_ z3k(m}nA`QqWEv<3sCsDND%$~yinw)hY!*B=xFI;W&LI0(-~?x4Xupu%rOk0c{#3Hz zqP@W5P2`nr_&HE#uA(7zJU--i(Z4f#-$kbR>bBiNW;f)dZrTgW5b~$QjhYtmQcTKU z%m}?+v9(Nqw=1t+=yrt1{xhn3`Nn@(6iU!AZCuh%eGS{zDO5<)APEvM^@#{|}bk(6`@OpyOdC6>XhsH5A z+i!0$K3*Kp=V5(DqnH4-ET|-P^N`sBV}0Ojusp^xLJ1go)~d1N#}uoeN9hM+E~13r z2iY9&m&&+K)8|V}TSKVA$Aq->9HkV~zJvs(SE|i=Y@`QyXqYoit=Ex`%*N(&+csnn z!Kt{^^k2gO2&fE^7#?yCUS|$y>N` z)C%s`(^}vDA;`xmcy2}$@;BeONDY7g(N?_ zl|dkqQBMEm%MRU{UBxJj^}v@I7NkXr)|qbd1%0ZCAdo$4kLhX#XuwF-Q%H@?b~svs z@a8MtYV016JwHH^@&dYLZS8ef z6x?!7rMN!=jk&2KOFEq$59i5uwQa2WoK|jVTO6i865cHBH(Na}DG+%xV3f2L17x=6 zmH+$~1Y%)o(mk-fW)D~kRZj51$oDSulcSmG&%;Z2`IVQh;|XxRomApme#2Ce?y95f zHt)K#f4Sb(e6g(s)wzV{>Ab*q6J(!eQlPa7nmUiTp@xZqTAret>t38BBd__JN++`j zYdaJ6sOTF}cYya0{n!Kw(|i+z`H88<89m}MZEHO$noh%DTYa2^5!(leO^fK$2iAo% znWbJkWp(9mKQM<-KwbtdFx>)&PIUh|ZSkt008vFmJza2KjYhDI`DJi#E+o7gqtFXc|<^kI(tjmXvC=1?$KJ?>ka>ble#bzTu zJfEwtIuru+obe21u8LXHI8aW5J$%gCdAG%;<+jW#n%A!B=Jh-_Z{o8xrf;tE97jvN z-IS{?%xb*$58a&l4zp776@k4LcCWQIZ2WrVVg<`m{X)uGGTgu4*_?Ko3%+)FUIycK zp}J0QtDc0B87u2mqJIF|+Pa9f_`sH-l9I)=1-2M2Fc~C{Co!`dA5$%jeNPyZaW?04 zdcA}+HU7t|p`c*$`+HYgW`0#7`nQKk(APzSc0~&@nm8&I%`j`pnVNTsyz@2XnEeIl z{MPUN035GSAd6GKetm`=O1X5LRi5&ISvj|37xkzXAR>N6!W9MimT9vo`YHOd7ktO* z!_4KA5DZvad0n}^&WX*Y2*lMF;#z%1`tZ~&>|WD5SI@~^%d?9xRMouvSk`>vLaVdU z-Nu^ZFnFz#Nlnipi|veX)199|2K9yc8do^JKktZoc-tj`-1Q*4I?oRH;DcbJ|8foF z!tc#oH)HPWMG?J?vE3Va6D8thc(XNHI5`18Q~}GFZ&Q$?IK-+5o4Bx5L~eF!epEFI 
z0c+@RRVTeCIdeguUVOx-9nGUEz}aTFIynrVaO4iCc~8)zB6hveJjSEJ(NcCl2uGy8m2J4oahlw7 zx{P@9WFxCru9sGl8*(9VpEa(A%8HkYc>rqAMw9ll z8+@kY*Ug$VQ@R^W(|S~C3&pyWH1y}Iv`DIg;$gJSsp1q)PF=rN0?m!-)+cJ`CE0X} zHGkl@4hUtd)DJY8MjumhBM21m>tCN_ImyeGc)^H3-dFet#G z#a^jST2+n3_jh=zNwZW$v_89ZS(Lts#{6#GJ+kxC#L~r zY~|_q8X3bXdx?|%ZZ`bvQs+d3Qh}cxoLbLMEl_B*lBp$W-1Q&s*m|>#52EnhS@02A z$&cS(NH+ZWh%C0{Lo&V2+xy={8_vkfPbg3tt1S-)Rnbq1IxMISPq|Ih#rQj2F|p(%4_dOJ;D z#WgfHm9&pnqOL4FK4n(2@-=Qyz++{{PRUe|t#P-iLqIRbc(>QrgaWan`30|1NZBgA zb-XK0yM~+yaf8vpS!0hn6hPx<$N8$Q@dHSQ)Qj{P+oX=L%WrI2=5SVfK0Zn=5s>ST z8z8r1TIHhSY{F#c;24;dbayjRx|NzVXS}-B{_VZ5b6w|mzMSiP_gs#*Nx=Mk&sU4=5%A7+$#T&wHCjYQJpZ`^sB$l3`!2taz%G{4NWs zYWc{L-EFANFREm}x1|1lwfs<&y)mo=w}fwHyLZ;@XdiQ2L+bCgFrb*gXSimcNA|V2 z$W3CzxNw>U0Z`^IK+A;A`;;+va-s#xaE$5BmF4mIt<2mS!4k`Z;p!2~uftRtHZ24z zos&Nf-%qaA81+}|k0>!xLFI(rISNZQ>(%vzRX(Sx&n8K#HFB4 zx%9Ct>-AM~M#YPGDo5Wr|1!W|i1AE4U&%mvoc;n2U~52*oyqJuFai7%7Z2nMlJiEc zF9ZK&eA8guF_wm^aF;7lmrDsWz>4Mojr6Z#K9_RHc@!T<>c8Zm7Bvd9M@6`X66;+S8mXaWk`PQAbh(R z$=yo8J~wO7^C)4aD7bNsC42eCFX&HFyrR&FAfQwl#2wLMHcWvv2Q$MgVu?tz9oOdL zeJ+zLcQEAka{w|MtG||FUIDpT-2s=gKgDM4{K#CO^JG}>7w!ElLblNE{GtHhgb(RnJ*De;6Zme=+GES*MNGD{uN#o>pm_p_KAPDH=g4 z`{nwE*O#1L0|M?FgyYcsbIU1QgZ4KNAymToxlgsYz8Yui`K3<+Jj?3>XHFgyd~W$k zM_r(@eD+B8H`_LpYB5WRojwh2&4D< zb7NX^)(^J*{Uk(bsXv=Sq87BIu+*4JB2&JAUb9>2onUsL`WzFNe7{li!7XdMG*XiO z+-=m&IPbl>=A`8978C`ftUPS!UKc=jnWgqVoV8-Gs?5<-SW2=Ynigji;8hUC<3gPx zXFUPP;a}B#AKt}KtcoV?_-4?b9iJ~ZFpl}iU_)MZAWdt1m#eilcS~_;dLt;wp~-2I z-m?dvk_JudXfZe=*V?0UFdJ(*p_m|*Vf@zVs(3POadbKTTS=wu(B7?nj>;N)O%=c) z+0qr_Ewm<74Vj>ufbpjz?C5lf=uFRQOYweh>Zi}9wC*7X<2fX2uT>I9m-_ydIsCnn- zZ0BO$Vmd-JH8QcOu@-MreNBCR=sA`N$LdUYiMztr7>Y$~)}M+hdk&~LB-0%pG?$pr zX&={gZ2cVRczydg<)y9^O2bt~>c?-siFwjvZl1Q1iS7P6iMQBekMS)=g9XRBf(9!c)BZlhI{(L3o(87el^qj2dPVe5Kpj{5-OB=K1g`{ni@(2N?7P)X!YJgRn z?V+Boe20s)7-q&b0UOB4jP>=jjY175+mt``K#L1g0ud|n-iPCwEq_9+fx43xS!1aw{U_GsIs8q} zm71a_C24bI8I00+@|NRg4%^Rg+}nyn_ObzwLFapHmiM$^dX7RN?Utfi-(;>jtx0UE zN$V3!{`pI{l;umUllF=?J&!#{%;`Yw{buKq`M?`l%)&3|dfoD$tHK9_OSbF&>x)dm z2kz9oCdxPI-jlOl_WE?R=zLLvpELmA))L(JqE|}8B_)%`F#y)Zm}MPeG3bueUU#dC z$iI&K)l0R)Vpo>UCZUIn#YSs>rX==KqK!nMp5)=`kAuki7EGX|Bv<)?;Oy43am!Ls ztNSOC5B5Ulj@+-+NG6}#rZ$^yxv*p;F8IA?Pd=$vK~y+!u;yISP21VF+W`rK3QIz{ zMQd*eNmf8Q-)&|}qUF5RuXZ?H9^Y#HNu5D{E`-3;iO*y9rCqjq(~qvGn@=U$q~CSk zo_beCEG#O(H!^yK%JtCN9EG(Fy}=f=^nJ&n-RHdFW}N5#7aM6EVl1t|h3UJs9>$(W zv-si9_N0C{%Oe6$G z>3T3WC7-x6PA+tHokoc=@e@RbUZIkJb1u5@s!;P;Zz)Nq;8Io@k%wMDOZO3~S@bxi zNGEDP307hM;!dpp+^$(_;q~(e!z?j}=ZIHRs#_jc?w{cq{nDH>J$mupUaBULe~2o1*X>Sz4G@sZEK8TpcW zZPUBD1LPyIU&2S5A93FudrV+l&~(4+esofib0TSbT-S+mTA`#n@!uAbn1gs@doPkh z17UMb1IGfp<%7;3a9w!Mk&OOhQ2?2-rRfc+g?l!r*G<6cEG;v#=P&p{5(a_za2V%m zoRx%~ri;xEI@eehPxxo;3HybE=pty>gGRe*2WUy_8++}j)(Ks+5$D-A>}~k4*V5u1 z#|pbRNIG?j?3=|OxO*FslBAe6Oz1j1Sn1`;bPwjPU!xJA|5i^R5KBu*$J^A5S>xFK zas+t4t@=`+j};1K>#VySsvuIn4OGuybBg*@4YxF@zfsaTKVKu|ot1&*@M#+pbz18C zyAp8zW2z7^)>4$1-)U{1YrKm*!8Pd(&`N^+prdqRVtneOi*kt3~u^W6~ zsqhR-*28-hV2(I4M}IlJs7 zP))fcr~V;LNTow%GVRUgV7n+8eg_b^$DkL=)-w~vYZ!9=TIu~E^p}>fN#Wi@!yQ@c za#ttXEwhRsV-g>2L?ozqJn>CcxOr_0Ocl{Ep zJ4YPALeV1p9cSiSn)h!@5NJF-MShVLxAIJAzAlG7S7MC)?K0MzxI!Pthq}+(pZ$pb z^ZOc)(_;c| zSuz-*a@#=4OGg)p5VoNAjYdjm?hlbV&dpCwzIsI0DU6NFH|3+L+%3;1Q?=HXWiJkU zq`jTg-X?l{5WQHiBpNGv@SUuohWzH|UO{D~sBH%x)-nQ$|9MLYrTnrt^6#duH!IVE zhJ`YXGx(D&aJF-8O^;~tzJ5aM?tGzqM|ZpTYz-Zig+U`|^R{N8`Of-gkajC(C?(*X z_A}NO7}B;|+bMbZh5kO+3&zhgVia6|oa=V$uWY%V$um@z5gj|0=X3uZ-J^sUiIVVT zo_)L#v%EUOj0M5+r;+ho4!&OMdR_#A%>!K72IFRh?QPIsr;)0HH(XQj2&6-;IYXLh z!TI~r7lm5u;jj6mN*oIHo^!|k1~A^x4H;xj)sF%9LxRJ~HrJD0Fm)ip|=7&vjT_{+E5`xuvex 
zAAGXTwIQ9;*&!#_H2r+@1FI&h?_54+z`;2_UU|5x`H$b=%J`|G3-$4hzb;b;oDP}% zGTHFvEpbPbPwl%YifwcP zg7Y%?1O$(8De)=EFI}FDKPt{ht+2#kS1V01H5ieT6_%X+{RyA3I+!CzmFU0D;lXCR z^TMj(;<>#@PqX&~`0HokuH%-lSF?{k<)v{Zip%hU6xwIpqd3HIUA?N-o8WYTyGOAl zre^2meOz4p&n$_UxiK}hfP{0D*Ywt8_qJDP`lc?2PFvw$XVF_)OBDBctn0A`;O``L zvWM+vxzomMG#H++8HhTrjbcSQ#g~MJ%i~~*f%_!=3q4gGb9ZGH=o)H8>B`lQ zbjI>?mW36Y4(`nucFuH=0m%KXCV_{?e5~vi_p@)WhpHTx2G}>g`!O`o=LlYYj&)rv z$o9)WNm;%QC{-)1$HA|oUEtb(v_U0H6C311D#*%hzt}faNJ7JRWBVeOx+p6l2?*<( zVNx8q^ZDC_>(A2=F=+0_i2jP}&d=|BZwlICuc@pVJS0+9(Q696WHjHElypatSCZCM zCJ$$ezwVXf0J&JAIPRL!;O6o$cTp#}CYXX}43$RXNqq8jI^0!RleDZRV~gKHD*1T9 zU7?@VH%rfHhFr_D7#A!AHt&%~sQR7hmNP%SPs{>ld{gIUbCxc8Vi~?ERW6p#2pCYi z?dzQ%I}QQch%#)Ah(=J+hSPab=B49&YoQA}uTE3Oz+{guLnb=wd4lvNyJkt-{*K9N zI@;8ql9Nw1l7kmq0?LQs18qQsnQIiAsI;5)_m!VQ^(kAY@E5xNdSE#`@HPCA$5K*Q zxfFiSH7g*?VnDvZZf89~qnYZsHj!lSDu&}Va%QsTlS9kfxLFFu#P_4V?9G($-vh48 zengClqj>l{CY1+|LtScKq@=PgtxKe%?bVmHc<^|y{%m2(jmp<=j8PET{Wc%KA_o;A zpXor>DQ*A3hr=)Aa@x;G(>{e<=iEpGOr1;(4wku@^NP(-84hkhL%`6La`^HPO}^>B zaPfOCiHTjIn|ZRKR!5l|!}+D624~Syn^KfANKB5XjLbJC_B|Z0 z>!+=BAJ5=TZZe;-I=1MZQNAy8|C2X-;q}+;8L)sH1VpGl!~CmjA9sI*S8-jE-aW18 zu~GU=0oE`OakMk8!kvwAT?diF;O1g~W^GAiRT*xoVAMnM0>$j;zIifm*B_oTO}8skX#>8(!Kz;UnQE~!t&y>C zFsG~T@@CNAwn~hKe80o3rC40I&jBE98)k(HqP}n#hrP;JM%56n>A*z-ULI@R_?FA^ zTlda%$=Lr=J@_FNM88tn)m80!sTU*+Q^)iwg6tYatW0kzgbDYjDTHwYrX%FEXU*!o z2~xEy9qgS}V0RKRdg9>qQ$+C?hX?>MoEZ{;c&vOS6Ro;FH~Z=L-@-3q8Tdq~um>S_ zgMlfpxiFXRCn^e~62J#|vS&-LVS&wL73DdAKp#o@s5;@dh>v@*{}C z9>NA`X1z~@i%_8!H!Toca|A;*Q0uIqu=9LRMdkoY6Q_YwaeXkgL%I@ z$lU;(HP{M)K-t-F4bm6=GhlKt#%APPX$ek~z`0_CFCj^Z`e+Jg`s0_ZWZC=^mW%9D zfpkIx-Ql(w#^iZhgVhv-7(3o4Qoo0bw!qwEOW}R9L5ubE zp1bgG7B%3JwJAN;IX-wwt+%|S<>k0(&_vE{H?Jrf1$Hi34P<&`_-C2)mU3g6YhIL7 z-?)Oz6ijytviHMy=C1kSY?ab}{)Eh;!D;fYCcdX~7m#CcCP9hvhMuKXzo3t&RNXt; zC(JN}@!oS?R{yudTeK8VzJCrTxd{nf=3ip26T{437D?lHQae$0wu?0?_(UcvG7bVf z>Kww>zh_(V8U*p{%f^#-{ z7YNJ_?F+@kzV_=;Au9@VaGkl}qbAHt$tly%tZ<~>?!;B- z;kryc5KGpRAkF&dnT@ufAVQV>znKg)ewt3Tb8)nQZR+lBr-ub%19JA$6Po+KM^^ku zr&|m9)cj_l#^;Yg8r_Yc=;sBmsL0`G{_GzsQ`ZZ#(W~`m{%7_Q?eJ*$EqGuPb8pOb z@&j3(HVhULZNL_VU{Aq;o@UqwY;v!W&Z-u7mI1mM*F{7Yglu#}md{@AciTXOkQ5wR zsK7hxO$NGF=WL}9%`ac9MAnx8rk<5N@N3Hzc=|gzGY{;?E9~8Iroixz2XX@*ZlkuJ z3R^&N?CW;4*iU>)F}2h>g<{G!pzi2wpc6dM+ z`{myzPO+-CC7ca$I@AYSxP$`^m5YC1YfK#!fx%l z3mZWWUmv?p1RW9m^9dGq4kN}_kC`f{^d6*oPW-hrURmli>2>lM_Y+Ip09YvS6KseY z$kvRlaogWx>B9_tejgQ%r2M<39{Z(I8_ArO2m+D3-@i`k`s?#d&y_XgP?UI9=Q@;H z>HaWLcHafugc~NgsQb>~C%;OQ^7om=4M-IL8lT7|tKMQ$HSVWhUdC85x4Yf%xSTWM z`)oB<^k_#PnoJmyc|M-Q4M-{zYf9ZeeU`Fa1~FQ!GiUOJJ!XYj4?HC9d=9X ziQOr(l*y{sSHID$=18qa^b7k`IOi#qT8ux8gz{1`wH>rU%-vt3C%bKyJp z?|3nRH$g&i4o`*hr$r67P729krgLhOn7kKUDh5>?P6zAj4Kz(iGceU!ou#6?j@XsC zeB-gn0IIWEmD!EPRUJIqy^!0kMn+RcwYJ3z@!J_sjJ{7=-^6>7v|_wpyM`$WXC3sx z!Q0}fUE4C~_e45Q?R1km9o8Fqo9+PIowTCs`s>v=ZQDsw+KbD{E|i?Fzgr?lgv*DJ zgt&y)a^lVq1+A9;=|0T#g(tVL?Rz@u&KYYmI%+x5Y&m6@({o`^PL!Gkf$8Xbcs94; zv4YcZ*m>4T5A*Z|2o_i5XxHX?=M&;%cukEqJLA`f8&{7(pcr${pJKOt2!uguy*b)K zh0xVB(jL%Z&q4BY8%Y}BO4+x3S8P5yz`!UlShwh~xRi?~-COL}e6~oba*my+O2{G2 z)s|{tanE(pNMwI#DHnQU%Oh{7{}|jV!(9#%Tx8{TS~n2&hr^K79UEpGzqo?6)M$Ob zBmtWzON`nh*8?o1w$~4KL3HBNHOGW;ghPL2HcUzW*MIs3Z)FU3b^pK>K{5GFLB$s0ptHm?5FKqIa_&{_AR-haG3Lo+80gU(>P!}0rCKO@t- zwgud*aB^u`GB(}hQkB^-*YRStHS@DI!|GdXTobMQUW&!5vDJR%(V#}=NG=^4%+aQq zZscGM+9q+(o&$@V@?oYnI32gQe~A71sSnd|{gKjI zY<1+Ecg2I#0S$J;fylKPEX!D}Fm9uLYw6sz;8dDHU(agsZf}~~`zhbz(I9=GvaBs4 z#O^|Ja-j6WGQh9qplqS6l~^iWz{L(ypCE6G~7TLRO+jIY_A%Zq#=}W)6_&)qh?lotWXnYnv5; zo2T2B8m=v^m91t&O=Ez*clYdCY`AqUP4+{cHA-!}M`p}+KJ>u!q+>phrgmql<#uhg zvXF$<`(NycF&Ky_*gQQs`#Gi%ShACfp@uq_D-j8t0U*%pW z>}f8+q 
z!T8T_`;F;@1$HpbvthP0WBSAPHMuP;l=#H`_SiYZXxW(aK@J&miWm*?jH_~Mps4Uh z_d7RBV?VEcigaJbSQal@X&p78UAyjRYQt)c7;HeuDp8S6;EV!0*s}3-2%D)$!e1D!PmTK4{+5Jtzh*a-z;G z&u;ZvR&k<^<|uVtyX_Wwn1{9@@r_P6+DtooZuyR+x4^JfFP5_GHQkQo)?)wcAo|EY z4+9sCc~$M3<8ocJ`z!>w%}CDj>9ZRk(&iiL)Sic{wj3J24^$>7vj|U8=gx`OWuP1S zY!0`MUdMp&Znz56j}0#Vy?5Yb*9CcD6>FZ_|IR)`2zp-V78tct7eWF2swlj1um8AlW@n7D26I&yA!=(m|^TdSgSH}c~bS+(HVyZYLqmc@q z+kB)*s}g_7IOzcQ$;PvZK5f-#oG=}A(J8&qfV7Wi z1(i(5jJOpx;<~ z{$7bI^6%|`=hku;WbDwa)Si3^_qD&@jUAm`^x#R@>?sK%jIgcr>z2!HkbNDinxhr% z^yAG{E=nyc@dnZ_X*AO|3dV;&`lB>AWh3_R+yUs6)K!K&^Wp=Rit$6P-mZd%!MxD@ zlsUlW!CyJ%Tvs~UQz~ZBvS_`seWCx5<{Yvoaqx;S2M_FxN2}-bbGc~5#jOnxsV$Uk$_I@{YBkq}ACr(nYrM6XKT7@AYX+t_SlIW^DeE}fQ zlvrBa`ue;7VMh0e*=|=y?CTB9k{eIv2!bHgQp`O; zqj!`F-BsQ17(^#@-NCdwfS<@|>{6ZJTEI=j0fI1Lielcz1ogUppP6k@|01!xw#ltazt4!`X*}baTzWyf^zJQV6HHC@FT!tpV0saKtd6nllUpM5 z*pWOgJd$m3D_kvA0QoYbuEx)aA%2jfzcL=kQb)NdiM}}D8~gpM-$_5Gso7{y3|!gK z$~zsUd#4btuKoKeKLjLm?nJr>Ys-ggMs1!EOI#&b4{%ks zK?6`mU^$UKY^C4(@t!}|qC}&cz8?j3Y=7S!==$@dpbD z(<*_=!NK8oGLP4z%ul3G2mI`m!wFq~rICf!5p6h%4M5nWfy9s3IY^oow#?}cv=&KZ zSpHOjCifWIvHb~d+v-2D!`06R3TisP$@SM*1}?F*(B!hQWttd&uNHe>Jdv{lSnTpg zQLK*lI(n*{)Pk}kDcHp$Gc za^aprzggvF-j1Z%SFns*YPXdu>S`M3(KA2{Nzs=^B&I+?bxTLFiQ(beqz@S%I0zYZ zaiyNMlz*W;c=2$^`n;<`zG31To6cbN zthDIfd+iHtnQj=`LAT^$Qh_2_X;s*A~ZsEa=3%k71M z>{t)Wi6@zhUE-&XChh%Ar2?pD=5ESi`-@dQ{HrZh70BK8w|OU`-ie#rE>-RqCc4-Z z8;yXKfsdQ1y%z;_v4+hw4KBWk6r-1T#JdL9pE!peCiY1bG~FK7y8}aH+qa9mx@re) z=Q`Tw*~ftGxwZjSM*|&&MjF5Fm~FY`7gQGFCu+d+Gkjvw9Muzh67f}?h5Kb2<8Eo@ z!OuS9a&)kJq&HTvl%qNJ0Jvm}n-8zlAlsvUJhKZ0ycnMEL}s|0w5R1t_k#n}YUM+lzI_oO74TCZACB zTvasPas+j+)1dC*SbF$Bag&Hivew1=DkMHfCHy=p8#Mx}6J<&E3c_2;11T@aoC|Y? z<-RU?mvpbCslO@scpdN0#BF<9lMHB=(wjQ{`1VHSTIAyd~;&4Ip9wd1r|2a`0 zf7YA+DAWJizCSvtoW4n#VW$F@ganoC_R-jGWrR1`;sqSR7K-s>aCB)-m(83{99c}2 zCj0d<)I#w`&!YuyQ!&b_G>(s_z@|B6k(LyYn~#=N~RI^49xzL>#+LoZnl zCr9G5nUM+1~DToid?Y|Lf6m}ntCMk&wIOe!KTI@YZaly0#l`i##EKIKl4xdu-WGZHb##ancCb3Is+^j7JFv*~=DyOx+_9bo`TqPtY}>bvFdM$=928)$ggcd`g_ zP0xV08J6c_aM$f~I*vkhBF;EjB16?V*=?Qf+l7kGf~DAjcgJ~cOA}X%+FkS~;y_7~ zWSdRUWm1oF!Vz=-g_DAdVjhjvZDDf@&Aw31?167_(N{1cav z2l(@UtZx7+54jmagjA_YjK#Pl_Dh>^okSV6#}H8Fazt!S>MBW+Z6@Oe03;RzCffBw zwKBnz7|QdLRwu}bS%)TlD%XH~Y0rgr4!v-Gv4~#{qeL*T>>_>Z(@1{-=2dV-;$zc* z-vk-W*eGUj$)~3ZQinQgut?xSpt582!5I=M3ymVw8MD&U7N>)Kb9DE6XTntU1I4EB z7LPh;&19WjupS(2o*x>{jiEx+W32JyB(1%B@fxaxlO?I04JVm*c=Cz)< zvsm>1zv12MJa=IB4nz#op5Q#RP=&@!1`Y z$csr5KO~yInnrun(+x%p#Q*Qx`sG=DZy? 
z-MG%LnCE0ZC3`jAF2w2JSK}d_W!ZPaWbx{a=#RJoki^Dd4rcj}k^l(cGI?~C-bynM`~GvlV6@}nlBoF@{=F~L!+=u?51G9)L!NjPAsNnt z1Q9c@yM#P^7Go%Pux=tO*Mxdab->QKqie(x!W%JXmOo{{u`Jwc`AU|Xb#3YBu0LYi{(OdeCrq8>fUC~EoG<0%fM9u4E zozKZw(ECX-_}nKu;?2vz`)ltjm(8>cy4Yu2e4NwxuhS{nIBy#A5VHv%zat0H(ye;% zF;cRHHfL)4jcIwLMIB6mY}9tg1?jB6ok0j}9^w{M76}2*@+Y42>54e|5HAZd4=s}V z1SQ!;!&VP=H+;+;#GNYOnmkzQ5hSD?@NTW?o>j)C5`aatQIlJl-kP|nzrhR`!9)_wtP#5;|go0k-2|4a~jIJHwTUdho& zCzON#kMxj?nV94WYwQ|ERH*SiFR|(!P?U)~K78M91fqZ=&*AnMUVn>qQOA{${sW*| z_>O-u9G@xjd_8`v;Bod{#KFExgW%zL#wN$dts#hY=+P?JMI@C2F1s$wGt@aNSr;Tp zCftXB%tH}tUp}j39^~IyF1xf#%Mb4=*l5)~;!4ntEh57WV6o1(mB3#PV8ab~d7fw( z6#~@IsyIH6773F3Be6B{G2U3D@`IR-FiFonO;VNLtXl!WS(>h{ujh+tn|KvVG}KUI zL?#4iZ;AAwhi77y<{eLH-4;U-r@qJ^`UJ8)$=F-peHb&BF=yHCE;HDaou8Z zqPh~X@A8&w@yj_x(yakE*eve;q0mbLMJ-31)t6IJ<2J+hje$#VCHs{A%D3XvTT_x} z*izeonWVZl^NE$cXR}Ia@>^uO1qN?`j154K0SV#J0q=p*788|*_Jv=XY60*@i%yO$ z{)H5&qb#v8j5Hp#sviP4DZ1co*Nn6aV3{EX-lv$XVQNm1oS;6Jc*Mu@%>kJ+9=tas zsG(M+CENgk{1?BTc~qSu;2J40*X;3I*w#vk^MG@01WeE@KuuekHp|eYW&~EPgKO$n{QX%r1(-U$T9ZDRj-L9TEvZ6WUtF*KHmT3 zhko1>mqv@{ER)DFGxp}>mp=o;T87B_RUzfH9OWd|xwrlHWdS@O2f_#hj4OckTri^b zLb8F&!8@K(mniFR6<@7oI2vK&wy9G0^lUIX9buy2i9r{u8BJe=y?~Mdu7zYFk9`5Y z?*8l%%5Q%EJI$$_*D^~4=>|ez6 zKPDxC+?S6x0fuB#C>jP13(iMSmYdUVb7c^7dT2ic(VzevFrwS+##@}Mif?u}p7+$J z96-Pfbn#=WX<&&>RFHlqJ0yQ2%(c5`#28m#k$QB#@Y=Wm3HaXoa%?^jFZO;|ZqSC`YoZ&=5TWgc&D~Rl4LUrj791tu2Uu+&05xffDa;{ zR@4DzyJbmsG5ic;;v=f(9gpdO#hkHbR)Ghsg%9VQZOgn~OPTw5-FE#l!M`{um-K9<50aV*dHTui?slm2Um8 z2XqWf*>{^1l;D4O?)keRcW)|i{HO{l=`e9mP8~xr=H|Dj6g}9a+TZ-0f+e1TVN>Hi zvz3GSN`Le0a;N}&dfM|TFY-*6;SrdHU;esd6VA6Dc)emb&1H#CZx5HG|dB=EiZ9ry&{=~D`s zQqB^wCR)QwA7Vjq6~V|GB9?f72l13wyTb0;@tfNx@C=b;2REGQpG$-h=9Ku^RN_mb zvaLw*m2;0-4Nky`2ffdg;(MC0lt#xt(O@)W1k%8qc7Mh}d*-ezb?#&5lR(V+{zdgz zZLp($paw$Lc55)(pP@3c!x8zE!nCv@IJP&Mt;ye;p>&nOn}T{vouSMGCMy@V?D}4a z^Em=Dv1Ek=ky)rAOn!7P2Z|XIoUgqoXos5r1m0&!^-?k~$?jiBJYp#}XrlZxky}?m zQgImW6q9p1=V)Z(<%8t7OW4l&KG64T#`yv1`}mnH5DsBL5+(71&= z{>Uv!^7OS(9sfnViK8A`vD?hcTf{zi%bG0HcExm1ZSjtvZc0b^^hQ%s$c({~c8={V zOv5m%wQ2qt8_Rjr;u==4pm2*LQeR}N^pa`GZ_u~z0}i$y6R94uMrm_6ZjoMc z!AAH&?&G;0rw7)$5{D^A6{08aXQ4@i2dy?bxPFh^3r}Z*p9TNym75^;d8ZbB=+XUf zcuOu?y0>Yr5POfYNwQ{cot|hm)DFc!7iziZAUcyB+)R%NE9a-U8>*nv#CzdcS{3h% z#&V5Z4dWV)i&>6S&2Fc?tK`z@W~h($F${f}HB$x2E!olaZ1k(Xdx7G}QIW|J)AwNb zQ9^0MI~ECoiGkQM=b4;VY|S@e7K|Z3!!C!CM+VwL+c+zye5*8Rj@lyTkM!s~OCs*u zC~}#1VmYD*6#}Sb!pjyemna(o*!XiSs^4U)ImEm4F4CXVPCathP1`I~X)>se<+_^q ztJ+S|XmM(9(+7c` zqdeRK<(IGicbI^+@5Qfbw3id|tp0sisX?mI&3Z;i`7tatbFskR2jbs87lv$ta)bMn zupb~zkP?n`$II2m4UkCHYJ^fTzqxtlL1zBT>-ITJ3DQ!pc9;)isM+?PmCkWxhWW4S zX4o?C-mz&?p%{rUeXs`<6aUhmDDnDS(^hHTV~@44vsuc_6@ z7K=oxr)b&Hg$t^MbYAh(GE6crPyM8;l;g@@R1<7!!Yce{3tf~H^U?UPVEiAyeS;`U zcP?M{`L{>1gfy9v#gA zy%h}@KqH7-xM$%-beegc$U6}@$|mu8;bOG~%f%QfZX^8BqOJ;iQxMNx>`jt9iO9SV zi*&!hL5FM);7QxTHqBeV(qY8}Uij5KSy*W`{a!i-EI#)$?T$FKE22Q+*JuVb_^O2y zC&qax8$m{Afkr(=XA8QnSa>7MWf?5!nYOW-iK$gc($N4HwsUXlkG4c zE#X}mEyZ>^HC^%>l5>wn-UKiU85-%pOyN(!F6)%AyMOLu}cIei?iPF`##AmlJ$)v|{P8 zq$`>CMIDvCto$5s;e=?&9QNcjj1W20Kbe}BAbHcxKU67%D#PbBqvOo^52~l4({Bqs*nm_ z0J-FWl5<@a3!P~IFy&FkpG^@}XUW62W53mZN;e8onSkoj zpB~nJT6Qj@{d+=W5^KR(pK4zbc@+ZIHq&oVNA`8x{tI?)E*P)NfP=6dR)gB0N|i3U zBVOpa`QVAgfcii75+SJA!zbypcs+XX#}iucJH>;qQwH6pY#SO|i^lQA;O z_HO5bdky$y9`pu*guT632CK_BTgu&_Igegp=r(i|QD%a0$aomG7nz~x8FojEQO2)TDVtuhhEzfxDfmH6e~ z#QC3#ORNGawNI6<<$`}E=gDn^!)S7L1qfLbDD>WphGh;|TxGIGtV9 zKF%(X3S0bhCfD-Kh6^Tt*85XFpOKqmtN-Nb@3Imkdi27q0BNe2sax8+Hq=S8ad{|J zEHFoOPoa*^befQ70{VRQs!Nu<%IJA2d&kOcCB>5-6Y?xO9qlxV67DLRo{2raz@`d% z$q+8U?X>c=cQ+fbd45KOe_bRuL#EkB-<&$eW!#ovf5_AKtm1fRbTRT)%!8?|u4Az2 
zXgXMRM4u+m+S^*9j4K^_g>q9Csa|dLt|~KJC`rs`=dp7i9;#ydc_IFz{gsOGr4shr zI^v|J+hAI3@LPIOo@sG6?b))qhG5!Yy+$yZi*QV(egCTE9C6-|8=|Y6))uX-pdgrD zc4(I!|I#q)^IKx{&!%INOyt4zgj%ytUWkP175oXgpRO@?2OqUYmQ26a?J<-0A8`WW`(nOY^h#O@Iq6#%*u@k3 zw#5l>4>i(|eOV2$?#$K)(b<$9#6a!OADbK>pb`$K52a=QYNf&0l@Q5wm+7{B3pxw#_>Wj-5i9E=B*7a@Vmu>Efs#8T~f4ZtbHWnD+1o3x8sg zr-I$%ya(CYW=ARx*4j_*1x@|d#r`uheW_v#H7SF$TcM_ZpUCt?F<`>TE^ql(#b2lD zQgpBKYT+(#*j1c(zmw$fq0qfT*(_6Y}0j5((LR`E{E(=1tteVrY3!^!dc8q+sY@Ag$k zoDAzcmJ>C$QQ6vUTRHkx-NA>@nw{sPPeG-&cR5*qe|vCC8=dG6=+xRMOs!gqNmhdS z)b`5AWVgRHL;YJNLa+044#<3bV@J0oHWWVYB5~|X z#z0Rym-_tNYty8pft!nOAJ*NCrEE8(iGN$H<}B&^*QpMDP1bVukWE`kHH%$*p0FAr zoC+GEUcSHOC4a6RnII^tH^Z3KynnB|*bjm+PhSnGvbqz(=_+csW=K)2Wbl3>BKjPY zJQK&-#Yf3f1f!dtrD35$4X@U}*PP0=#$WcCDXR-e#wDrDjnA}~9-FSI&RNQuh48UC zsU=QStbw#C4%Du6TO^G6EpEq*mzy63FYUn(S{|>MPfZrgxW(~6AcMDKeWuli$(Dep z$VWr=Ze^>@32F;9r+2QrNu|Ge)Oe{SnfH|My}usnoi?&WgbIINC_?@!N6E`|KCF z>eFoV4x05?gC}R}Mf&;$g$;@mcsXZ2@pp|g23CtW2BYFPK*rW7XnqZkTF{aADu3d4 zYUxN7Qpmq_STpNIl!3GB-WcvQ5tuBoIAxgll7X5S^({U%441?uySrS2y&W19S2v@d ztt7x3CAvDp7E3E$^@tL1nd|h8Ke20pBgv6zCfL0?^3izxpH`6Ht7M{Fg8(N`4WupF363LpysLjRJx!^>{(E+q|*{4 zHzRrs@q=`zJI;YN9vxQ`lf07Q{YrK=44*T3{c|uYHV?Fgj30N+L$VV@|DpM6;&;+d zAHu~>shvO9)wRP&yjOlawmM2}n)Gx@$1cY?O;2Y`Ijeeytu`xb(FAJ;$-BG@o)_l*R_XGEHc~%mUe=j-2oB^Z!1TgK{zi)d^J%*a4 zx@4MQMOotgn?3Q;C#XRxjLJ(O3=Wik(Cd~b;Bn;Y5n1Nx%NGZdCvj&!#>6cP`pxpb zP#QbyPUZy2xavP}a3m(t@$&i5KhA}oQPGZS7Mdy$FiTs>L35jVUbrt|iz-_P;@`o? zP@iRYg4){0n?KB9OwxWO*ZMZnq<*YOa>G(Md7QEwHvs8txg9=M;kZUmn>K~K2T#vM zfZ{V%SDv_ti*~noU9wNVDf-zmugF{WbM~ew9H}3V`xd{AH=5rIQvglHO_!?K&tD(@ z&yoY?B@e*a!VR??;@{V?*InXiViap30OY5I!}@%hQDR@+lC`G9%e2oKPd*j?Rzk*S z&#|w^qXu7x6EKzJVWGy)N7CMUYk|ZDOY7j&l_)Nt5XqK!MtJg5s625`2|GQHk(aRF zEOjvG;xR$=i)1p{IeyDEDek@1w74YL%RDST?UfY*HfdV8BrC}$CoS?G+CQ&0xsriGyak-zWVnN1k4t_rpjIZ3(jftFJ2$YVr!= zP)k4rqId!VI<<%#o&g1G3FkCMuGSKhK%hjq0zpK=5ELngiimg=P!dOuX`oTVDI_pK zX)%Bu1my|_3T;G=Dp*9K4bkpL5Cd(=e{bf?yldaSx4XaJu1TlvKS0Lok0b5+Y-4n3z>Q? 
zDS|KB?-?ql*Zwil^+Sxd-bnCumESth4473`v$@0-5=j2Ok!%DJEADoG&ymc$__B$}b>{V1+-9!iwG~u0d zV1OG`@Yk3yctSw_c(wFr5liv8CJM3^*_ zZgFsDKA$;EIrD(n=iP%Fxl=DP{d_Na$R*R}*rNbowAl7ich3h^(Vx)L&R6AVa&a> z`Xfj8!{tPQK+r^B;AZ9OtM1I4jbF@x$5z;AG<}h8nh<9!<-qd~U_e#n_`!qO3Gp&- z_mdAZX0J<|jisUOX5S2#Qpw?EGgxVN-(X)e;7Go6Dk7P-Hy9Pywc&%m$VnR9vwT+5 zCyI^UOf{>IS}>OjYU;Y0 z$#ZV5d#m}tB5E-RX8+E!2#M z_;11%uuut;TQqgn1th$e_piOKw^a2Wg%z6bKz_RX4V;B(&A*TwztPVhy&0{FkmTaG z2|JLY+8N{Q(tS!!_#q)P#?ej!s2fBbpMM+*;eAI~f;<8x?mkj>A*iSPh(>Te0CS&# zi^~>pSLSxh)r4VX@Q-4g7+st2Qrsf!bPXy{NJ;K2XsuaA<7FrHcvsQiN*Smb8)l@1f2Ou5=dNu*l3e;_!NXl~?G(UN;?t^Xatgboe4mcrOTclx2Nrv6-i6Pdn`0!K<6x R8Gl|4emGYT7oJl<(m#&Dd}ROt literal 0 HcmV?d00001 diff --git a/docs/source/assets/kernel/query.png b/docs/source/assets/kernel/query.png new file mode 100644 index 0000000000000000000000000000000000000000..e2d15ebbfe26ec00d2d57581a8709f9f2ba69369 GIT binary patch literal 32710 zcmeFYcUV)~_AX3EKtKgVIsyWM6sgitq)UVl6;GOw(fI(_dDnRd!L)hM4gc410#fUL5xK1-c1=?002OGKDE|cwP%67PNKa*s6HiF5=3TO*X!~8I1EK z+w;au16&e0n{yh7d>IikpkHmXMB{MY)qV zR4)l1r-rhgfxq`d?7rehI-xrGk1Vks2LzNc9zP%02rkKfSTy@MhRdpb8fYwpL(M+g z-%Z1j>%ra1J)`!R0cNAJjR~tAzGNanvi-r`zB6esW;>>~q*SA&|0#i27)< zuZExaG}Z(#a?mBPx>9qXP&kPs_uZN=?s~#QGO+X{y0T}gKg$MHZ<2MQJ0%pEZ;5uj za(2DoAUkXQ8rif_`-oFQb$dYfd)K0(W)1g@fOWiG>SCmsst~_J{8>$L%RwFYTN%ag zy59%XBcfB6%{-3ERQolfu4Eo8@@xzFHRc3=;P~ReQwg`;p_@XK8}{n<_Nhi()ho-8 z(}$~kON~rrDlO9BK0u})MChEVdj(QAxXwo}l^c?N*Zdw8SK)HigLOWt>X$b{k={w4%RjT@ z*noV0s$P`wmgY;zc8GJvR;cEXCz+5$5p|llP9|QD*qV`H;&>n;Yo^a zdBRCDidSt5+>Ur$Uya>y&2T5ay8Oa93JLy&cQy3n8TBhx>Bn!^-xK8#(gTlodL$b3pD9gXuQTJU<%cTy{M zGY;wq<@jrKHH>#3tG(yRqi}jC`958R^5DkVyRTO3w{CWjF2$03e{4mNa>@k=Uk)jrOZnt=7dxE@gMd$$Q#5tP`rqstG4=YzI5zA4?yLHL#$V!`tqA6g;3c zW96qvjx>E`8aESD+FAdz**&PyN}9Pc6#6}2L18~IkfId_P6fUKCIaIl;;{0pR7Y2i z3BZq}A3ieD;J!V2O_R#?TIO}7i0;QWISltda&z!=y0Y!EH(aZ{j*7_TpozcR^_p5$ z?M^vcV*IAcnkuELTbFGO?o^_5`aRA{j^y~b_X6<&T`YG?sL4o^g_79{o%0y8>U4IW z&g&HGQI&{`W~M33sgxIF=E@cXYv=G9^X`9qqsIHOJCQ!omHTtfb*Byy3Kv;wO4yPgi*k6O(z9c_G@Zxzc5i46AiX^t-f;D|K=`Rqf#L;U$ye^2tkb$ zX?sa8szdJD^fRZlevy8Wp*o+D&4RfUv7Czh-Ew{%0li8?csWM(b^c=Ehk|OwT-Buy zoJu2)yc$*?^~t2IiCR9tUpHG%VDi>PxqkOW+w{%p#cB1rljoLW%;Q62B!eF|w?`!k z4|S4@x60E?U>YZNyIIWU1G-veP>sFld9Nh;s5Ui+#F}Yih4bn2&~s0;8d?+m6kqo> z_vFM>IdM+<+8Etv&@IrdxRjK%?OK1q+r+Yg0y7?-Mza{l2S@+B(nU7SD zm=~%S7;3<66svz4t(c6NAWekJ70L`v#f{-L_iG}cQ`RF^hhyViVspNb9egQ$7%Obd zm(SO2n{S)&c;y(%SVV!L;AKo>vXC;7jAT4{Fz=Hi)#q)pHM)A#KGkl#YPKf5ZoM48 zP}8$Qk^Ys!rqaREyS@3mvBF(u<9w)Zim!90XvT8aVAp9^bQfRviNibl?ew8FtA(#8 zMMvxVwtk#1G0Rg+`FrgrB?taLUDo-I=J(LiHvOtAj6Yirs*xYgL{6jK=Nfw&%Uj>M zg}7zA6+%iZKU$_DY{2+fTdWB-8>cYj4sLnqU2PL1ZL z!@pv$_B| zmU5R)Iw=SC(`$n?v1Wx$gkG?<)P7Tuz2U&VvyDS7zAU~4fd@$;8t+t#R8M;;R}A;V zsr({fH^K?bzEXT8%Z9qAyK67s=D8`OexSmmNw0FOGM^@$@-tB<{y?XfCON{gkf^{Q zFE;OO0h0+%FfK;f{Wd{`6z# z4woe7-T9yOE6=LcJS;pAsRHT-r{M8iH(n0h)vn_{S z#%fbLo{)oc^J&=l4*eA0Y>-XdNoF%?Mr2N;mRtn5*_2Lu%E(SfzlIhn4%!TY71uhIc}+xQQMvH#@{z+$u>K2q zn63le=Y?V!oEiOc=$$9?sfz|qTRpX14yfsybutfV19lBi+sFNfU)KaIe!<-7)?nmvvS_WH=6 zFQFB)nLe;msS?y~Mia=cN}8&T~ebqb;2Gt51$bm+&+KiKD4b zwWNnYaZ$oJ0j9X`%t+*~lz?)k37ACOa5|=QRNQc+9&mTqVx{Iw<0AOa%Lo0h?=WVe zd`%5rfqy(^!J5xuN9C_0-Oerb>xifw(>rjv?kdKB1a98BeB-L>aQp)LpK#JM z@;1_VB5C91!f$2kW^Kp+(#8F<9~|kIlE9&howpV1OBZKXFUglOY=2!L2^?SkEx^Y5 z*CpOBWY~-}v{@D0JndM;_yzd|*9xMIb9QY-}=IHJ1E-4@Y0)hBJ z!u)QY4g&WjBqRg`g#?6z_<$?;y!>3ftzPoEda?hjli&S3w)3*_baMB0a&u+9?AOZL z&Bt4YjqUPAe_sC@r`=1Zf9~Y!_4i`|4=8YXM&LfbpunHLfu_=z|4M2*y|i;Se(dA| z$PBoLtdNk1^k3KiZ)g6wZ z@dL5g9WQ3!8&CD|L%o-{+jAsGOiSp_eXb;MX@kO8aS{i|I8XV4Q7owo_vR#pa-tgF 
z!$<{2Q}u+&l01D=hKHZDQ97fZdc4)klkr8O^tY{}jI2o_CRee6oIxGJUJkoLC-Vrm z)%g>zegR?7xP!%j7nc`guU9w-Fa0VBF2OZ=k^osER<2`^A^5nR9Jfa~fG5=*KSFUMiQ~alK{?)lC z!Ih#>9BZ-vHe7l6x03&Dq*q3-v1aMuznl7R!v%!0`QJwR|Ag}YC*uFpHHGQ+uXb9? z6an~!kaUu4(0NLJl1T}BY}9M{EIQmEuA@13r;Yq%-`uO$|ExNCYSye_)>~137RTq? zf^PHDhrPlBDOeO7E6m{9m~HIZwBpasuuT>Ei1eOG-H+}135(T!slDg}XusrE3Knw$ zA97H-wyqSKpd~wH7P#MTkGVJ-T7+L5^b}aagD+41x#K1+puTV#b++F#Aj9w8!v{@S zij$jfIT}{44sxsM;#~bkCLm(!Qm^kU$#QL-FO&~DJ1mXbt|i~Y6c=U+p;0GpFJ-`Q zokj{$AHF6!OXw{rt$4oLpTUscK}-;jC(wGDBe7YO+p9n+L)~X$pK8~u%9UXoD|2k* zg*jVGhhMA+*!O>u7`oUC+6!1c->MW3LLJ8x(0Dx7%FmO@SCr3xm#>)l_o%b(;SuO{ z>*y>~{cWq7)fRx^iV-uv%xXGYO)q7J(qD8$)7c~$Sx1!g+@}ilB`f-H*0h%S;-&cW zL5mmV_4F14mJiM8(LZV7f+uqzh~HK@^qVn(Uf}JHT7rI=6Em-GBwTs@2A{hnp%(66 z39VQ;ecJIfMzNHVN++MS$FCGo7lr&F{o8PS)C5f0Mr8EWzo$)J{0d{>udntOr~B0k zOhUHrjH{l7rb_#HI{O>bA6uucDs++@*~_Xl^6c}CEzhyC)84@&0t89~QWL((y2t>>Gome})TW6+$B%}CUR)nT8wc$^&O zWr`vmk0SrwgRs9XjpF6Z9^GIA|81EDeeet|gX*WiSaj}^fxF_v1h>E?hQ&H4B6x<; z{R{4DX25PllHWlWSDzAt3w4mM!0Py$Wv20m*KVPk@{&PY0=ygj_M(#fW{rY@;3#^k zE!AM=x|W_!2T0J(R?5{F@p(Z8=c-U+uRc+m^uxoo@d&CY(*0Zh_+gWJk=D zOk0t>6}My~sypak==G@`Ov;|G=ejdg!ZH4*^REF#mMS6B7q@j-<%5~bXr%8E>wPpN z6mf;4kAKZ^nqYpg$BC?EfGip}#b%ssnEjSNxJz%5Bo4@7rOQB-pO4dwJ-hC-?Is>s ze0;uQ&X@HQHePbrozlH~wgwS*YdtIYaP}n>Ul*oDO8FCvJw<8DBF**Bz`wo{EBP6k zYi+&iLMUDDIGp5yL(ek@K1qxe1sYoT+0|msx6%_0Oo}VodTcBw!h~H@`H!+cj_Bc-u-qGBK*$;C{+J(DTLRng64^jcO4a_zQJ?j zWFo7Zz3 zI{?lt(3xLx)|&`KB4ql8l0RWIfM2Kg&TOvf3%GcM3C zM0nOCB9?39K@+t)%&0OKQ6co0p=HcLiI<{D4epjB9Ya}>(K28*Nie^{<@7!wyk~2w z=3EJ*Zt|5FX;H~u{A&B+i%9Wr9XprojA53M^a^sZ880)g#!_7zRGFu>celxA=jf~a zGjkFI-XOMrkX05do|YWid(^S8!;p}!CjK)@%ETbXA=st{k+Mo(Mic$35VmhC-!2<* z1F#JI#XZeFB6{o)`qG4r6q{I8^9i^2N6=gk6yEy6E^JTJT#I~+f2sncll&Cl)!=~P zyj}n``fxmlFSSBmx8BC5v*KAQeXe=)W=W||Tpu5VahgA3wq&SkswPV8qs@0p@6RXm z0gE?l@OWn2;6dJHq#R9o@?C@33b}q=Hog#=U)qB_sTX@8WJR`w`E}*CZ3i2L5DPDa zi54!O++FR31A~D<*`}pRi+B4i$Hz|5VX@}8QFS3k7AS)SQ3mayrU!hdpUTK2Y4Ztf zIG?0wCud(yD2`}T<#R{^Qr0E?iNYVisRG!m_}y=Mzd>l0Apw8Wa(t216VSa0wPu@n z(N7xbVtfzQK6&x%Zi8@XMCWHJzVpyy;&Bt}h+JfzR&_E5=*LDK~MHvg` zvUd+ko<-9e%sI^UxxvnguPi$h>=YWpL{5Ak_6D|%eC6ht{^KrbK9=xZy1 zCQv;b3YnzX+A$SXvShtZLXeXv`^Dki6f|4RSKkb8`&GcqG1N={zA&+ z3C}-ji7gwL3JgfYx7e9r2b)}?VVwBV$1)5Uw(2lH7X|aAraUPWy2BKVD(9Di4)}+Y zx=torDBG1H0VOU`)-BYG87tpk>g??H=}oy`%|%DeIHW66!R(ZBHSY|Urca9qG5V#9 zXx&yvdk|59ch~>mo0fl#>Q^b~r-l^}ENK?nu(;RLTF7op@UXD)yG+|?L z?@w$~r~ifWebI+ngM(x^U=(uN4~g_$!0q^f-lD5avF0F@6KG)yKTZ!FTiIi?bq6!~&b0*UiZ=y7&AmqahbH0f7HQ z8!t8@+;@`wN_#4-J0&=63H64>R#-swpt;kz7aJSIa(70!0`qv-8@bIZxE_uCo}^x01)AxW za^}p6m!{<$NIVZ*d`qj;$&ZgcHP58Er$1r2**ciPq8B&YSbO;TKB+HkgPunvHHrGu zNius@e8G2-Kc+zz2>*~0)zrTuw?THi;I^6lP8Lfq2nMEk(E);o)_vg8i`UXRjd36A zKyy90c!I)Bv^5mPcy@Wf(+h%5=ET!>#0PohVw{DiRnnD4Q#PWHs8#DlTxS~{r`Y49 zg&G{oYovDSMsj@6A!?;P{7J>1SS}6@EY%>hpE8X+eCatM)BDNb-KMo1yRpe_F1eyb zH!Y+fZ%PhQpmE`qV>-sazGF;3Q61Adhcca&7C-7K;>Jxhm;wgETiG(q_Ocw2@hF-1`K8J~k z)MXw$PAc+u)FU^3`xu55Z-l;Yoaiz%TB1@@U$q$}R z5AP8`qV|pvKYn#_L7wy#Ag-JNmc4*KL2QUplbdC_HB)mR1}ouO_}H-Ht8utjXV~Uf zzYKqKKBzz$!)H>u4Xn0uewa~#XK;~tR=o2s{Syo0E-*=RD|_(4Yx=HpUPGVKk4ZHu{CuS23picW>1{{%%bBu7I(k8@(){Dw=c+Bmoj-hwa6s>7+X=Xf-b^#Y z>H_A~PsZQKoXn0)cC@)S1l@*BxxvxjYm^@)?G+rER9h|$^1;NQWcc^$ed_!qs zq0QUYQhjHO4KuF7u)WrR#)hCr?URMqg`jzo&k+(g`k}a7hdq`UAs=Igt0Hjpj$50b zn7bXP=BHbH&}U!qY607!1aYbAp4^Si8L0V)0+4X76aCqm9M)+Y7Lemq_{lTU7%=U* z1IFx9+f}ct$#)i=!H9;?+1?B(sCE5L!%Y7qm*}^RV-lKL0OlFo8)2X_v8t3TZZ5Wk zXx~2$dQ`1Yy{9h3BX9@xCb9_t9p3=k8P4$hCkqz#0-vXY$eXvJVZVcc0d;Wd-b`MU zGG<`P$9Mv-EU{E(^myFf+(iP!o}a4cI^QPvv?Afy^UNZCSJ+O2(i$m)JIUG%DEm#% 
z)~EGW-CFy;$bL$(M#pU_quvQP2D!W&4gX|69TK@QQEA!k`Q%T4d#y+ztNN5o{g3us zsWX)qSZYZ27(#e-GSQxDf!58L{8`?>qnm^G6Pm#uhi;9FVU9`Xv)CmT?5zOf7VMiy z?3?Iu7SZQ=S*|Yes%OPN5E@9GwO_bseBMBwHusmabQxv`hS8p^ozOw4_&qt`C9_o#TLTU_*F z6Hk05%xR0Hc#`#eX)Oaioaus&b*FXN{3E|t%9SLDovv5j^u50-V8c*{pQtZ%SB@tE zKyEUPKzw0+ralAm43cSDKGxMrwlp;JP@JUs*!+4JaS5rHIZY3mXpS*r^cx52b1(@i z^215-sP`u%G&|yFyKFL|IfWbXs^Fl3m_hWxAMG;QvL?u_-XX4WAczEos3>j>ROW~gpTjVMf%jYuEsG+2u**t2-u-1-=deK$)W=S zi>#BNQ*w+e2GDtiib|!uY=!VJ^GS}w;(=YQ&Rl+P>h+?e(U_Q-aJk;i=DRMY{rC)~|vF(Xi0Q>Oz+CbKeUdFxId`BO#@-eXUjV$aW=M>q$qwdq2 zUw7ioE%3YrguG4na>%z1ykja*JWqp zgpgoOxZ=g57*Vk$L?b5R`;k`+lxN>g8#6uus!BA0w1AUbn zJXPX4#YNn91_WRh?bS{}OpyL$NtSbSn&(oa?u~gNO>mcT(9w``IF^`WE8SER=?Nr2 z7CDcCyhcRz=Jg!v=jskalZJeKEOf5D|Hp0)rr_WvcC;AL|M3~uaYcvo3HdL=gkwrS z%!{g>(;ZqGd@|$Os(oP=0EFH=^fP7G)20aRorpUD8t)?LbA!UeZJYr&{xjC4x)*b7ppc3F`}Erl|PL z8|tP?T4lC(uo1N7C@37w_hKmN$-s-sze}3x1aR)Mfw(y#g-dJX$ zX7i>RXq_w07&6vaR#6gfS6J~XekFgP`SpCh?Q8fdB=Dfi)%QWKsc&_R$o|*XS`K7m z#@mBW$9jn~`SeXYs5?5)Y#*Pspqi^An;9(He#cV|yqQAU!J~6JHBlTg`teu4HMPL9 zyA8CG8!Iv{3bST~cBM*|bTTxE()1`t(Ygl1ClfB}hzZ-EE|bdC4Zm%NzBK*)qxT1V z>^{c%ACCzlGh2-t-L3Ay{I+cve(ZmhlY8Jc#zb573nt`ss85+NM^UmRB-6f@q{>jD z=_upVw!%e7kTB=jgxrxbw_bap0X(GVkk)aKVk&Zc4{zATa3x{9JOi%L@lQUKhw$ay z&!2NR{zNB=)K%o(t6Xwtsz+?L$;vY+_Reh?Q-OzlF+0cY7syJmXv4#)$qwVgxxL^C zzMB*->}Ayt$!ri_)&|CQiN6RtVp^`4Zfy~E_)k?^)RZTGdBK7>>j(9A^H!Q2zC~8| zE}v#C)sPP;3B(xG%$d^7$QUb_4o79M_^;o#VYuH1WN;W)VcFy&)m0W2d;#Rh_(KdD zmCkapKZ3jMt@;DJ}PPQjMbA#;WJwaiI z{;LD2V$Y~E1oT19tejp{$9I`*5QVnRg5#5{km|3r!q2|o)B*;~z9QjMwpT{K*E#s% z8sm=FPW^+}65iG58ziv$3_8Vl7tf835H zuFr`V8acs$!VuV4YP*Tex+~Q73iaHYL9|}xVjt*<>&OAFpzqyv_FiZ!vfS6caA&4r zk3f*Yt;J&u&F#M2{H&yzRror0b;zq+QpRb}YD#HkYx8&IN8|r!=(Y8Gz@0PnGa1Q$ z-&)%wlq>TkTcEO1IewMER|{F)UB7Xhbm3}l`0Mt#yUq?~EQIk_9<1m*6{lmF=qoB(;Q0b3Z`WMj2+5vLn)0({$ z|1OmOdCm4^-rVBVGRyBd`3EHZIhqF4B7f)m_Wx-6zx?dJyk`533g+L$@TY^{N&5dc zp#MuH^!bK%s)&;gKlJNsV(}J>%%GL2IH7S}6wg8M^83!p!dTU_7h)@2rN_qJ$PzkYp_zCI$F%{7aMgB zYve?JG#5sWm#1@8nm~Wpm0zq#RIst=W-i@TkhmM?qW+LF{{y=+(J^b$ax8 zTFdegN1gctM$25((800# zmlq71dR*I%fJm=>(sYq44OTjPUUBxk?z4>_gEKlZvk;d{D}RBkzL9^qdTUgG8ktO% z_S``QE3LoxGV{>5URp0*8J&+YC}ly88-trN%iv=N{nEcp1gl2KmSqKaWbQ;f$Ut zeJd9N0n;xtspK=&zS5SoSQfn{qa_k(`Sr+sL_?ggEu50lz}#o7V2cSUj+SE%QaMNUzNeej?SVPodv8hQx%pF7wn>aoplJZPh6yLXS;E12h!RUpP8Eafb-n>UJ zJb!{W7`~6{!%}!)G_aXJxV3W7&eaa318~1?wkFNi8^{J7zN{M5E{ymRq9^t1UcV5q zr+-$rAALPz7VR=B+z-AIb^maZW}hUn+NP5TCL_BHEKQ`QWerg#WaGbcD5P}jDRB}0DDI{_bR0JrK30Kpns`xP z;DLrs*3TaowfrSQxAMK5bjTW`7Fwgy(r-G=Hns4o*_o)&^2JQ%p&Ps1`+DaQfqaOh zPsx<`h=z-~PzRCWx5XRk_pF@p8_rh6;dH^i*l2pu?#b>2SuyEem}_S8*7B6$hZFW~ zO{oH<*tH%3xJYH|dC3L>4mR2gBWJKbN=_<*JMnCc(BH*}zLiE$mCpbx=x1;dyyDoF z@^G{?ljp^z8+r|~eZ!vOw_|zF|78#Aom^LJhUGVKI&mWg1U=SzTIZV_I%ArNcAU#R zub}pj8?`hHs0sJm)YnQBMT*1FU)u$o-*dp+w?2&whp6!i={d9-Ukwc-q#k6D@%OIF zjCstU!G2qt2zS$sPAx*(o&*rHZOWgBj&(2s&kXBpUF|0x(&SSGcvGP1c>0o$dQ zxi%bmN{E}NSth}#`!*z}6F?B=?cS&&iwY@99E|Xcu<=bXOP-68gWml`MIn z*PBIxKh)arMiF6MM%nUs#>kU+b^iO|Ouj}bx_CTvL8K+`uW>g6 z-K^hvj41&IS-K%e`NxxGjSw_qcIs~IR4ip+>wB~!JTFq)=M!U3SCCJVKL~wGUM9r_dW;jB0)hK>>G@n1o2l#Wmu5)$y{Ir{ge}jTAB!id*j$26@tbXXy zixG@=gUrR z;bmjgD&v*_`KQb1Hzp~Q7s~p!CtjjxOb|Z-?-oZw@8v7Y7Q;tFg~;;85EkS`c}Ruv z$g7D>?|%gN5rk1NJqK!0*Lnnz;vHYZ@yGxHn?0~CFOY}vJ3ik{A^eSC>hS91J~piX zXrx&lIBlHP^#guQ&@OA^aEbAZ@ zpwpS!_AEO$7S6NHYn*BSQq<6XpsQ@Jo(6{4=<^x!{Mn{X74*(*%}yBm3q$mstYPx{ zaGSDg3lvz=8=N0iLe5QdzOj|k;5E1DfrEz?DgM+Ef8201e62E$Nr{!`C7ea>eUk9% zZS+|Brs1XF!25Q}K#H&kzB6h!vvboOc~UzdE9{WrW6%gQ_yi^?s)H?02Hz%ZWEHTj zeqSsJTQSRKYXy#3OV1h5{&$fzF7zNFU?-ybx=F*FzXym`)oC_ 
zDtc!!XD1S$J!_bzz4u{*X$aYYFIg?x7g*^t#XSxGJa8oE7o^*e+MA2z>sb#a;?g)hNfYVn&6Jrr+L{pfPAC1+{lPK|K{spG`4-n^W#!kGL%=gC z3fw*j|L0lMa4W#3#a#{l3eNy8kW$RBTAkdH5=qs7JzyyM)~j9J$f$X;*rGQ^3e%Aj zmT;)~q}(5s_|whr8sQbzYb4jSBe?%9JMusAKd?B50TJiH1Zn=*=-&(W-(r+^fHWRc zNB_yG{4?wS9Blw$%dmM$q`U&&?%OcHm$uM}IBF|w)d=E!=95S7?A&;U?Yq)=zUB5h z+4G$KpE|*n|J{6B8-i*1Ol*VCM<8M0Hwhk&gim(5<`tL!(m5~a2_Zjv^`+=(Ztwa~ zZrig<@Z^7+c>6}^1)!1_zGYJ=yb8>@gwuh}cLep)KjXVXtp;fKBO2mVCv9MT0Q4u0;D2CNd-x)!mX1+xXXLxW;b8=DXsayk*PM5vL=&d1+s=zOnL>lL4}; zRFQ9=cm@<9{nZu@2js+)ErXlr#9cp~0c?m~pfu!qJnJ=(IW2?wD*f)m0us=T{-&rCFQOEvNftj`?Gsk#tJf2Wx&AM-n+BLksPV(`Jq=)OxNX+%*%MCd1Ex#FROpVm zWsIS)eXpRF^!^(;T?NCfx(~XA}OMl%KW` zc`7gDwy+1rwf1kZg_Tv`NbNIx#_t2Pv!qLU($@4-C^T;DQ4VJOhIQ}^8dbn#unyLJ0m{?b9mC}obctCnJ=3`S+p{gaNlF|Wc|f*V(3omfg$skOl`7?7(s0L1$o z$azu{Oqh9TxOXyv-mIB3aJW5C7ZapwfQq(Qe>HS8ME9ZJbmyriL}TXt)|hI$c}gb_ z6Yoc!zmMc|2bg*38K8Ly;RJwc7}o@lu!Pq@-O)e+AdQ5I^Rjro8)RWW>pcQo0;zw0WU(09j_#G{XqnPwYDh5FOJ z;VL)pjR00Vlsa^)0@i4SGuR^d#&7iWjM!dY9>0PCdp(h~awL!T_4Ly>pPOI><)JPww->`TiVJXvhoOlH=T zj;n8}brWy8k*eWx7Q1et!D8G-9~c<4MUu@tBdW#O412w(Ml-QM`Zq0tiYshJTR=(D zVIV_lf^YbGTf-QlB{a`h@#Vp4IPy7J;OSy`HJ9!-?VYNxZ=R*QG(;#K0pzyTX$Ic2 z8x2uKllNz8A9uwS)z(&WE$S&5gXfOlkRqXGNHkzPuX<0^T7rE#=YdtixGue|g#ao} zJwEOdo;^qf3K4^cIH5^oi~-x#)rXb+?Z@E`Jtu8M*X6TB@yz7vI)+^G0U0#oPs2O( z1#TU8R8ERym;!nG7tomsCLOS;{^hO_u}F4jGOY*Ay{v7=!d43h)Qzl8yE2EB0KCn?xD{~QEqIcjDe>7os;4gGn(wY zrm@^rjA9ISy@`LIiA@mXlnI5Tb3*7QI-X+Pq^PDrOl=@mRu>0nJN0# zeYVp>2$pdY#{U`s_)6(dmr^lu8kng<`CL1He~whzAOl(R8c_e+IiE=`J^TzQP3isb z3jn9_O5uu(;pWSFNT1luJ>T6izv&gX6Syj+A+^RRiJD~!*;0)uqJm?eWWHr=** zG$#loDY>f-Ms>B?eV;+j;Yg(_f~Sgn7RyZy`3Yb_YXl`0YXHzB zXq|RT-@5`nC`VrI%dF)zWr=^1hhnYcP|iZCLng^3xAM~0g$6$sjjyW&C{A}SaLZ*6 z>c~;Ac1zNK?J+m4B$-O#W;jr5!82q%_izAPQBxiSYjLrVW#Ov&OIfPX>8 z?_`V>FDF3y;NF3Xj8AQ*-R%8iZ*=`!_hZ_7g_-^(Jxb2&C+HWjx_5nBb(C|y=s~BI z&V6d-1;86n9s}g5^^2d_TL+y!4psCummI`iyPYW0fUUp}$h^bc^33my)*PLo=?mHJG(DTUr zc9sq}V%7ESR)N6B;Ibw%TyPP~1JWw)TZ@n~wB2Za669#u%IEN>z)}a&R;7HmK^CY5 z$NIy>D;HUJDOM3%Art^1Z|WS@rh*jb??W1YL1+Id4x1Pv_5Ef)4$k zz8r5bnNKvm_+lR}GqjS0hi^Pl<#$-QQ*2dbGMpgg2Zc0b27kA|Bl9q!LDMrW2=s7F zJ`&zSA9TbJ@|;(lGT{z^p!cm2yHF-f%1_Napz{V7?t)YNFJ=# z4M(+J)Anet4{tJEl-wsv>_B!fxIG36i-O=&pdJ`z&T~2vzqFbDRQ2h;fAC|TTk+$a zxxsIE*5hY`$JZ{sE^15;BgH-1{?J6{9^{@jm-O>=`eh4^dh_P7-+HTE8|MX3jHgz1|-%#tp^s$!y%)$idiG2 zIyVXiTs48TVm_kcyfn<}GqDEJ7g%;tn;DxBS2Y#R>CSkzF*%S9T7RO%X=1p9Yj^+sPuOiy!CV`0$SBg^AEor1 zj^pxy2tFSza;NMfEvJIDeh0D%hR>nI=U-EwGSQ-ezAA?VR>Q2_NA-*+Oz;?4$ho7Asj}{}>|b zge{Q2d^=O6KbeQ!W`U?MN7isiCQG9>)VvE|51K7<=WK8(3e8EAb7_8BAE1DdAEE{) zg#48w)y07-k#mrY#!vX5&XV|?x2`iCR)!mS5x%&mfPMn<-0UDb5Sl>LHLJ+yA40VE zdp?hBR`yQ?i$_VWm1-9HkgQ5wV#m3=Sww+YNg;`ft=DzqpaGPqI-@9}v1d4>p*?n-m()}nle%|1_^uzCN) zv^>Xk_56X$OpFC&vw2Leuq0bYJdD$gz-O(+b*n(4uaSyAZzPs?pw3_%@uH%M^Ke?; z#j5Ku%19@#bs$3<%mt)Mq}-Yg6Dq>7S9l9gfx z2~oRjK%lqjMVhRJNkO8lM%CZU(LPl`Ms3IMj{aKP=uF`ki3e}mAS`zQYO`1RJJFUH zz~QE4GSW$UJJtcjt$r*e`_5aB=~(O=QryHm=QFwFCQQKwmG(wOh#J=z;_PtEP_GI;tXlYM^ zoE~O)o*~C;9WaX-%@wb1)$1HIo1sSIZ08c^Vwc6m&O6uQwp{0$yw;Rp6OEbNhOK%X z7!94FN71ZzA68xlKbN}zP}S97 zEDAvkl-{go@dWs9-@ejOXXhhs~Zn`D%@=zxp|-EwJW@ zo}8O=f-5VYx_Ivp3+zV4VnzDWKH}g=Qe5r^+?gZ0ps~Sx*)Qy=p*98$K^_-P*ndFP zSdqP$F>v`jU6s#vV~gL}@Nv`QyJ`k@hRaE&U3W-&O77jA>$w90=tS{{;8q*Rl*4N8 zC*ieE5^JFTPXjF$@P0E$>Ka-aK60@tHE>v@mpKxGi?5@0cR(vsRe#h9h%s8y*MO>Y z=jY?`a#Pq=z|s0*MRF{F8frX!ws2B5>E8cV=T_)ef4EOP*v3dNLmNZ zX>#D!fJ?D#Pubse{fsmKL4d4m>R7}@9;QU^&KM#~?Pc%%&PJrqP>$k57>6&g)IGQM zfN&xue(xCyhW)6ksTzdZfku>9+Fk_!ITb+66P8BMt)gszoiW9I>EaGgE&!?wnY6($ z5LYU7*hGDJ-#`K9fun}O^2LQ+6 z*gE|tX@`Esi980I+x| z((Qm1pRMsTaafNW=G0~rU~sFMgVD(1{tPLYQ;!4(%T#8v%WUJ2j8d>$yktbSM;v<` 
z58`nNGOUAz@e<KvsjSf{h;rai}=r4xj^qlU(aA`O|-rVuL!;1GfZI2IF7 zO9lZl5FyI)7#z}O%k7Cuhy(ly8IzRn)hFzx3%7OCJe(80M98@vC?kgR)!FX=c5bhb zNnoQ(;p3ALV8g*kEDYc+2rhJ;c2>4#^Rw)MJ8Rn z+XN&8>KqI+BSrXNx^35X7;?(9_ndmv`K;H#Cj1+jLh=)<>X&V)&PS?h9{ zA(56G)1G`&8nl7^BLG`6(Y1X+f`1@S+DY%*w@NVjMLutlS22+Kf68lEp{FH-{8(Ft zVAG@G(mb+NfcuplvAo-r3=+}cjoiSN#ICen)RGC$A^?D>;PN3z5`-~ZX3>635v9@P)5YuTNT zk~rDZbxsKlvd-=NuFu<28 zpowy#oJMBL7znX@@x}tN>BdUMW)Q8EqewsS`RP(zK&J655Py~cu4Q#pA0+|WH5;q3 zQ|H2t9bDWqwOTU;*XF~Vr-bSOf}Q**80G2qx7G$;C8=)*}u zfQH}ky#xyG5+L6tab$#kRXFY0!pF0TcPqV-dlLPWMRa=7`boLmJm1S)E@4aOPG0M&KwuxtMY9Yxx* zi_D1V8*f4@E$FOgRC?HBV>y84?NIAYYX#N(uuB{-`xMMu38XkU`1D%r)=yM7J?{i# z)kaLQSh@ou2smDx02$K?7VQCcP`Be9qjS%ghXf{PK8dIW?mvf=qzj9lhkG&av?8eGyoFJ}OC1W($cRXC!?X~hf7IDszq z7PZkjC+z!Jmm`AcI=E{tk|qw{2`crFb#goHd>VE(YFX?csgfMOyZXu`Ky5;{5E9&M zN+|#YXm#oNhL%;Kw7$VtAKzobPYXp3k+$|B_S;+<&a7`#PD6?-p2bp``yR! zeZRgx989fsuY28bUFUV4=c3GVU*Cu;tqq?k8}v%ck44tq#bqS#9lqoA*<6i@W)xp{ zBJFK?rHU&d&LZWJ0!P9ZYu9qMckS`0Z4j<9SORR+JC%m27E$Nx0n{bqPG{mmvrVQ- zLhoK0$?==!^z-uhH#5D(BC5^mUQ0ap%p7o9i!d31~0P5Hh=~>nmc}E025JHyc&Aoj!=FP@T0k^W5E6 zh7erq(^RRFZr$*F{pX=}4r#zo6v$WQc(kD5r6Y>1pKI2Kc&$CV4j|=puYiKCv&0p` zlv&^gE$8x1`ITayZ|S>t8-#Pa3ZI)@g{7v|b)w^gQLUa+(!e@TXSLVYCX46R1qa=% z`aUEsRgYAizuQ&s`TAX2>RxHzJcl-CI3w!~^w5&GKhP>K^v@iz8%b(e2^|A*aJJXZ zW{1NeSB_J*vcnr?nCY7`YOAa*jg29- z2`o{OSbknKUS7ABJAsdWe5U&n-CV&)RE+tM*TcfCDJZyJ zMixj5H9KhGKM3a~&!3;f=P~act`v=k6aX!8Lci z-`5o%1M;Rq-MNCa$80fa3~a8`Wq^ioOZ};+yL7pmE>)+%P$7hBV!mK+fWm&-1U%aN zZZcgO-ED&hx$Hxcko|KG1yc?yrUb_JudIN|(5!6=AbqK=t&k0Yt<=xU>&4zlO0Doc z^Px4#^#K#co~}Cd&HWdZ6;a2IajIFYiRRMa3pWd(2&x}*|K&S!o;<>zeLaP1%Tnz` z@~ph#6ng0?>`qr2h%9%~o-TpS%POhICH5WS67ba78grzUC@=jSS-!9>jXm+UP%eJ0 zn{>oGsC1scA%jn0+ctNkn%?{bZ%50Mq>%|=h8i3FBvi-C=v zhLBhCN!u6AnL9z2ec4(T!)EbG5amd)_jlL0Z)oOd=SeoUdKSm*qhX%633pc%D+qdb z+JAI^S#YHGt@SkjU~Z}EhdteT`>T3%J>r~}2eD3bsM2o=uQV_Ccb{mpJcMat@!-=f zz0X^;efzjs_q%4)sMhlxsyowD4RQqk&i_C7a|Co|Wsj0hDHfN5)qh)TS zHWxCGQm`o_2p3npZRK6aWH+8Tf0%ObUTk@l^ym47f@UNEv0@Z{mWD-Ia?T97q2DcK z-Su^|y*a3H|ISNE$b^F$C#$Y+geN#ZgAlM?R4kY%I21ZC>E?ZMl_}Q30Q#Ic_4KjT zU6kZ_ILXU&D?1Ld6pBe*?kRomT79i{g_USjvp6rSQJgkUdyt5ics z?LEETy^jao*bJ)W!})}dt7RNr`^MRU))gvz56fmehow;A^MhvvEH1}@Hu_BZUSmrr zV`vb??VE$0Ui4Gv)@2b9ay#kAX`6I>HscCTHlFf9WRa^#ESyhgQAkf%hiz2{%Wi*b zgLHH1MEY2;hCt1#`jB1m>EU{m@BX(H%dk;uJv$*vt$ZCT8_IuZd*k)yf_i4$QKq9` z9XUL7hj3nA`3K#d zvE7}dApqa(v9}UKrrtzUBnVVa(v|C-t?UmMXLsW6VH@-87V#XADnFTH^i2!<_G!03 z-^I~0LpWqqb-$r|o*T&47WxuhcV1OH7zrfZ3FF7E-Vjj0lJ-nE#PubrGO1~vxf(=q zvO{(jYvcM!l4kwn(Lz>(09@3=?tH0R{l!MdHqfg;iL7m^MEsyx?eVod+GSgpwbCIW zaDEQ^v@VQ*Yk%!UnNhvm!kfI+x(`=!lR%M^g=~QoBHW4) z)No{=^lil1P@$N6g{{ygK#5VNYN&>tU7`E|rIL=Jt{dd2@iJ^vAqI>L2tre&R|y2# z8}%k_3XGiZcK18?jz2^CtNv=xGCii5;ljiLuE_HHk~-HA?9A8Hf6>wr=v;EgD0*;L z2)5r}8#G~THtxc>r#2%49eLuBe&C{CKjT47mot%qjFiFAFA)n&PNEeG9~4k8FS9>L zt@6CTuQYxRekxqH2x9<@?G;nl>C~65aE$AOAdoDwS-$fTK>- zS{N!TS)jM5Vpc@#K*8_+dW20wKFFHo8_kBZu7xlwY*-~&i@x~I+|i8Qvr%_U`<8zG zH7MzSyp{ZlOlhZY!mTo|acar$5+`<>M>BMSR~_M%yzkH-Y1Cqqv?2hC2DI&EFsOmb z>MY+D+vG^Ek%WNVNY?;PFa*&}mX^ciI>2u-c1#G#8w|Nri^bFH^5S^rDvKTt^Z9y@ z5Ya|tUDrfohz@7`eZQN>a4R9++o@r<Dh?BVQ zC2!PIXsVeyt^QR*IC1sBMT`yHcfKC`jz?KA>GTH>30D8hbEDc(oee}rV{`e>EQMtreqlC&jY03EPN zz30Y-DJ^p(2}vU%eNOm#Wqr)sn^S&L>|)3x5v%tY!x}ekMpNlwHa@qsK6zpS1`Z@? 
zf1JM?>ii_l0U}L?qngj{!|n9lHWm{irFp+|6nKn$w#HfnUR!pMtd++q)-A6X<`NlA z%X9n0n&j%YFCA8E-WwuksSlO(i7WNwN55Yl*2N5k&q0)pD(DAwtcOaAbRrxL z8zd`XWs#bRzi5{tPQ)?h6pFq8ASGsiX&zr$1}~2>F2-|)UZy^qk$igMChczq({;F) z-->;2Pm3zFXPNg%K9L6}iaq8YnS>~s6(EUN=C#VlkdaOkYv7cHVpry2Gz}n;g%3$H z2t%1{Xk)8$AqYuQA+;(p%6S~riFKgFky?|TwgT{p>WTe#Uiy(|19Hw@L#^_ce+Yt` zykeIx{lZLup3mlqaxPqiI{F5Y)4;9x+#%wb51z;(xNzmwqe)i8PCH4UL90rAuhG^k z3e3PCZD-Ufd>kn`49o_@9eK+<@b>On$QkyEGqTLPYzEPNSsIOXv1WfQaCB$tcWjLV zig|>gG}vOSs+19^<-J0UX#;KMOHV*9;=P`kC2QqqcZd^Y(Iez=t22PTg^s;ooedJF z34Q*x@)5;nbHb#(MrU4{vmZD*_h$wjU&S4=0QN6n7lNT?5`PdDWmA^so);Ty=!Ro= z0uiX2${5zkY7K_|gZD+mteA{4dWL5G0>P_9BwE0nWYN2|(y`*jJ?O@Jm+lFe`x-+9 zb}!FcEZUx*ET2z|)P7|LpcyxzHz&H8sp(s8zyN4?^g7+_*f`-7?_mkn&~Ts9tBexP zxjmbvFdjRKs2KP)6(i3g6*C=XI%WY0xWDgu0hsX&tyk#C2S(dXG-Ms))y7;2q|b6) znNCkgfhpX9bz7U|#?j4#`;MlbZH?2AS1wGtS^9B8GPbp@+d*|FZCRN7yAml)_BVxc#IFO0Sr@XV{z0${Lci)C-9^rGHi80;qh{_wqc5* zc-K?{2l3@=<31qdA@fBh`80j@cPF2}huH(X4f3OOQO9}Ia!@W0-93 zU4qz5p;dprCAi~=u|&0!0JT?3?lE=y6s`Ffq*yuEQ?E{WoZJs4U&Z7V-tx|P4k?%3 zv9zfB#Vgl#l)t}xOC-gF89Fo>q4&_F?Y(*9<|(+$1KeEv!fNOnqUU`%P1s2(87MYA z1Hz_|;T+sJLQZrKpk~=NfTF~!3YxVXHN%p$;%`wyf4(_D<~jDx%)vNN!j2KHDPKN^ zN3M4)n`iNj*^BRE{!{h=#(>lHdvRO}^BVbZDtkW0eBJ;=k6dCzGwa8r+ID!x~rK8HZ3hO~Tk%82pDfFB=xEE+F)@+_(d*rh>Kj}=>a&N)mg zvCFKpiHa#vu!bhpL-g1SVH)Orfl-p6Z*w>&FYa`1Zx3J{>BQPgQ06E=8+`yVX1@SS zm!Pep!z5(cI_MiTm$P+H@lopaBfDPPUmGnU6Ya=yC8)K~jq5+60CwcNiGf)9b89Z3 z=OdzQ9aC?1b4LnUWYuwi2QJ=mM z*wJ)y$)T60DDTTbsifi9E)ev#WT>XUuY|=^xrQ~m3x%(#2VKs-Vg{sHohT+ftj>WX zqs4H*1kTAS4Uf}dmgg+LmvhRzcg&J&YI6FykAi(`WPF60&xvI^V~6YkV9FJRhp_?= zCiH;ZKTKqGn-_L3TDb(;`@7NJn-#O7wI1bR?9dC56tO+T&7JbdjWWxKu=hN1qPG0H9j1~fai>`f+$13m zg?A|Aao^b^uz7B_NIn&t_I0FFEOT21>@Klb#aM+B!&ao(tkFzfaY{f;j5FvuEWu|u zkUv#$6sQ_v++cKB{}veuv-~xLaa8d!IJ5P;`?n7h8FdI_2C`!4xJ2bA3sYx-jyu!+ zap-%;egE?bRuEB}>)kn&@az7E;xj>taE2&0T<=5BgT|`O#6LYU{->nf6GSq#0 z6t3@^Ow0MqZ!b^xq!rMe)T-u-74==&n0~bQpALj(^(6vD5X{=B{9$%P?Pje<JIJjF=(<7%GsR;^x#(2JyRVJy%q zMmj25?+I+6>P?3 zak!R8IMf{AHL|6yR>V(UQ|IcdoRqZdHL}gWdFjW%E~fgj{gE@(W0678wII5Nt;T&a zoCTSTat)XuL_zJQaz^7r`dKU6g|=Fhsux?I`t;MyIKyE4H$@&i%0>F`=BykoBQ$(? 
z9qEYIYB~>vEu2sr{%fYm`Y0|}eECI@%TiHZtc>gyoznJ7y{qkB_SSf%>2LY~ zd$5Wn@gzx3nQ-<;DjK%cikZu^+zLLl738*ayU|=XsdG);+k2CMCJ;2;h57^rukP$* z)tWklEqLq_G&6a=Ziyx>sR#d^(?Wdn#NPY!7$=J$!E0hkwCcmRIh1!y-PpIS_I!cs z)4X4oM7?W5ich|MRYx;c=dnBFbXrfUq|RITt-$>G)DFd*Fu#mKs<6P0$RhLtUjJa7 z+V-WvH4cnnjMKV6xMz@eB6Lc^C*M_7kyL$~yO_FnwhdD*iJN*cYyG>PZ9+fBe5kU| zUnf1Yy>rLv!Hla{!-{a#)o61w*(6EtalBlw_^tjd`yIleK%;^F#8!`76U=8L1joNE zAn05&$MLlBIiLrfJ4N|F*TnDR?8hI37!^tx!Op_Qc#z|+_++S@pSHO^L!@?t|7wGJ zLQZ_=nXCA35$=h_5uVAzuiF%+PWKySJ4CFwzB!e9)>LmmbJ6uc)ndi_$1xEtSC+5; z*GD$*f2v$W*lC?RwX(N(nq9Iwv^r)EKQ1f|wZ0AGhI<}6q04edn}?zK+W*x@!g-yX zs)W@8Hp!n@vTd|3AM*TLs%K(5`YI)euJ?>L5-&HU$ii0`o} zLz}x5P}~3X;ot8ehzen2PIlRUxpw(jXmeTw7ydU0>OZ^FcK{HfH!2E#|JahhP6oRG zBn`aRmw)P=|Dj<;R3I}*K#~0tR{U2l{67yM$Z$J=3LpZxfQZMp`)lCdm4(Wj*8jcD zYFD5`vN46tKh>(xmB=Fxren-|p=$rxbPjyBw87!kzPnidf7~Io5{yR_QuaRMujBQd zDzuxpT4q0`um*u{HbfsPP@^=o>^E@93vOdL6$Qnv_*gNAjBxupubtlUm;QnGBbViu zdsJndW;>uhuLael@3mmSE!4sShpSynAl!)1h3F+)uRMPUC|7;Jq0A#f|6wZ*biduk z7FoFcVkpx7pW_2nh$*=wNS)~)gu-rLhVqHN7uI_28uKg5v>uR6%mTSjNpTBg2HIL=J9ra{sA69L+=5eR5^6wf|3;Q z(k+Iuj<}Q;qt1FWZUvRof6Ra6n3UGSqDtw~Xu zIcM>i)vu~fDs}I%=bZvuv-_9VKNhun0|u5QwLWmByX+v7c&~#YQ*MA`d{Gz4;m)td z7B9@UBq66AHJRX&5G^|0?l?d2u3ZnX0TZ!S@JslZ!8n<|m%YQ)&~vRXXp5XF&N_h_ z^I#e6PnVUGkL6i*1-xcfthS3>Y!{3}|G;fyhebMZ)9rB#xQix2_s0>PUyBO?8M~$; zx4%Aqg2*((A9t|9uMTG#9Q+H6`)lTRkscubZCDFYR>QBjUDd2)JVx%sS*uTzU~IBJ z4RUNDzaYvFAo&ci=h=xWVFUG zEZLVg?^K4{hGpZH>UO2m8}Ct$EUosasxkpd(sx)%S2l_8coT%xs)};6e!ZMoIfV0R z`HflIq;4DdnJuo%abi9=9{)0G1s#y7tz<6}Q!; zvqKr^rU>RAl|BM0U)M?G`P?Mtc{p3G+L}& z25r0^ULVk$laM(1Hdl0OQ)a;@fK0rdfIT6piq`B{pm|7dyi`a1+vK4=2Wxu}8IiX1 z_m-JVO7KzW3SN$^bgh^LO|IS15o>_P>}ZS%Z^E-Y2#f%9h#YGUR}t0uzGfxUJX%BxG>yThZ6gwfByy{a+{K9m#w z;tFb$Hu`C3#PrZq4|PWL=)xEuPDo(b)Lj8BJZ!pO^TNG>@p*pP2<<%G$_w_^vM+lt z&6gN34F9&m=)ktI7AYn6GtW9rb@U8+8CLyDnky_|z}y>WTZ=o}s<(bSk7Qn(a`{3X znMs?RkywWf{d!%yKRR-?;n@idNaG#yWpz1B zW9<)N2KczZ=68}UZ-4wo^+y5zY2UKl5m}N5kCu)kn^UdMq7L2UT16aRh1L|A7h7L@ zxEJfmHhIx~Dak}X-s}#OpUs{Z$dgW+J4gAjshDy9)_bm1a)o{CBMPS959AaeoK z_-yX?EAk%MJt~40GHEy{%EY)T(SanTcMjZJ=S%izQrVl*%QOxK$G4vF{k`k>_ScrX zUUvqmF$0Y;crG*Z?tqCNYLBND0|SC~@rvD_$Li|l*!eix*koy|6tmG^&9`aRp2gr% zU!VAKNZe(c!DJyx6@ZuMxMG_>OhCOC1S>6Mnqr#>+L#jyp-QKd{9Fv0UU)j%W1;=) z?oP4MU}gPS^FlF)Q(*iLoX zM+eJY!g6(M#_S%&pg@nNJYovE@oT)1ZoX+;3pe_7Uer8m!$Zm=#Rgt4htu89SH;uw z>wiwOsFLT6L`Q1jrF+Zw@;JNbkehRakg?qrgdI~=k!e_NI$i0*gNbLkSu-cpo)_Jj z_eqOa#p<=5pR!8gR(|mdFl1Fzq*zrxcjUL+Aej`{AnH)svwt!7IA#|1QN>{$8lHkx ze^v7uo8%NQ%ocTF8RZM_HhFll)vb-HFIjHC2t18I#B2M3dBP(08t(B{NA)6B9z0T2 zI;!#4Xh!l@U*9N+{;J{9pAHWxpm$-}LFkLvj11g-KPtp6)$ypRJ(1`)nWX|A*ya(; zclB}O{xxDRTt^oo!?vr}hpalvB-eU15=(dbmj*Emz&OsF?Jro$eb1+dqISl+Q0fyp zy$Pv~ffO!rsK?AzSNrk}^k65tio_GK;8|cweN7a__ue#5m`Ytm3ORJvo=o2>E@^GaI(13u~xKayExvVOfiao7HF`dIlbU3HS5J9 z3B>7HNGraFsN$bz7sCr`;`Rvt8gEdhqSg3``v5Q<=QDbVJvZ)j6*)44zYaFVh_1_p3`B?rLsK6%Oxlx1NP3 zjy{SheRa;2$mdSjQNV|6Q{!0Ugb6BY{klSnA-xt3&z8CCtYR`ebTLg!DW1Vl464iw zwK}IFj`$3;^M6>8qm1bHmHg_Wm!xkgFikORnA0>l?WWAZEpk@EIMEF$8cj1Wvs9H7 zT(}l2NMpE=kDfeLaFgZy{fhW&b#_a*?z_`%G1IH$-Yt7JrniNtFL$xgG>HAZ2Hv!ez5{kS?1;($_M+{Pvb?IAuBTK;$OtV9sucPaome|Ic!@AC*{#t zhpnosyl*~8?_c@V3lswo+uS=e{bLa(AtNOZIHvYBsZH`^gsSi;7@sc6X;_dSxj+m= z7rmUv+0LR*%N~RWQ+;_WT@Y^qiRSIhX4e|YgQB?By=J3)VVs95gFzqCFnVph{pi^y zSgjv?xurTJ>z@gMwQHBH>TQx^?kq8)uBc99rEZ~?csZ9Ctso6kB@I&g>FxEIRa}o; z2$ef#Ne1EKJCl)DRbqcZF|<>sy*rv_;TKT!W^y!MM_Tx1)5}US&9sO(%0h)9tYQ(W*VDa)6K%{Qup+xo@ccbeuBSNnXF#ol z`LF3h!B7Fj(1;*|+YQ&AOF2=I5VKP!(9~r^2y+us-XX>#FVj+39*UdTZNoDLoE!As z?QM)x?#MjJa(-~8EAPB!XQI@o9Zwq}Y#q2i5kU^8dy36kQHuczxW|wbsnOsY`jz!ltjfiYWFoM?soIATh&)zP{Q#)9yXG$u6JKyGciM;& 
zR5$gY*m+S~)luslo&?A|2J$0brafPSc-3>X`8F=b2yPAYF049=s$_#=RY9h@vub{o z<%IJxd1qkK#aJ9{``TbPpL6qUb7qiTF(?Y$JJR*8a4~h=Z%8f@ZHnd;dLM{NmFkcW ztvPAFH9X<3?}eWtCg7#v+Cux@)ov~3Or8{?e!e}>14_a^SetAMTC5EBYxa&2)rvrmz5}#&)UuA1=*(PbzPjB_h>mFc6ByyaaIYJ zUoF|s*ShR(t?DHsA;nEKx)wV@_qfK>@VBu^d!Jm%^wBTI6A9TisZg+At=++$=lP*R z*8Md8!Zw|bxuc)n)Fm;7T*`MDD7>9KGXT!Om&)-ysPc~si;dg#cz&0~5wT3GP(ht} z1pLgxYX@=dU{RBrrsM2hshHM8swcOTfiSXI_N}r1=_z~@G9L6Ti%w78 z0xj*22f1}lNKMhf&Qi7K57XDw`6u>2GEg1Aa_6Slownnuh9y(7`<11*XA!C@jWoy4 z)jN1(jP0VW1P#32nk`ixr~j3j(S9pH*bPm*4*Utog&y~XO3S&0ml!gZv^buav@Q1C zy(&8#uGzL@wcNPirp4*TI-?9T+y{A5PJ-v;NcW-Wn8>p?=`PkjI=IO1k7+IY)^o14 zyC&-3L1~X~UwXk$&?YQsO#>?%W)+D>K+q;vaHV{oJOr5+7Dj+tV}tf@_TS0UD^GI@ zG_?&8vL)qt1W6gkfg6Irv=52V*HKm`J{Dz0H?1`bjU+wgtYdq9q+Qw>@ivXmB(^!6DmTO-tiMMfRpU6wESFHtct(9c0Yf7_vgv%l>Rvj50H} zQmx#lsT3+=f|!6qe*w*Z(O*%19>AS=j8+TET> z$36@Y4{=<`Xsoo$zbR9#LnHYGamBzpo`HAtao&xhOSo}(254Pv&svn@YpnnzHnZoY zBKz7pq+S}`ZzH$mqYR)f{v)@e&wz}% zsGwh;1J(ym_X>Nc&IYazKaa$wi3S#ff{Q>V2$WB!H##Bj@;B4}JShBJ(+|$2z{Q>q z74{zk!k34>Yr^j0XsVVV$<3eH6nyF{M0|qSzDM=!zfOV5tcSTD!R#zp5SRn;Og0c; z3>)ofNgq*c1HDthYdOZUKR<8s@lQt}Vs-Jm;YwWLzMm&Ww~#|>moDx+j{#5RvTAl` zpgfDfx(#}Ug6f`e1eyKe&~YQs%=Ci9Fsb#n#n$@gcXGfD<+qml9d*GF0p>o#y-mMN zVGyzPZ=N2}f-%?geg>NYWN4ydq>{OTKrzW)2pDG>fB-X2xx47^l~z#v5&F*ltT_Pt zp^pfyV!t&@{_HtKXg!F8lZVP}3qF9QE4uyp`vth0v+!Gu1PPbmQW=R;w2u+62=~EG zo=oI9=-7i}XgEN0;aEtFR>TwifAlwWf_yRD2DT`E*Iy4A`BjAd`#;|sfT+BAC4c(2 z8~p1&{Pka`DG{pXH~9HzrGNTv5j=11>7(bDe>?wwo{Y5_E;&peI|;(Y|MhFehoRO% zb>``RHsa^-1A;{u!0T+4j?E{2cHQq|_OI5x0RvWc`m1#R2Gg?vypFWTfwts-Z4T{| zD}Vr;4Px{A?^FD2B9b0v$z(UX?LT$Q-|x{9=iiOv$e)I=|Jge*fWyv0o14=WKm6~_ zL9o&jYaQppDE{Tz*=|Cc!xW>ce@lRV>%Z?;f*?Lc%=k9j-ZK1P=~@gy4|iG!R^Z6N0 z?|Z**=Ev03)YSaARCRTy`<%VcS$plZ*7L0ABuG|T3=NqO83qOhO+s8)9tH*;0s{m0 z6!8i8=G{;Fei#^3elsB3%57#Q)OXcYuig&w>VjmYruVZm4|ckC=| zfp>`Xz9(eUh!VcC`l#@@9%_oVyZW%t>u55jVaw7!CySKn>CnA;wXOfNq(K5!>pXgY z;CvwY{?1{I-()bk_WUW#7Im$@-OB=4fu=lJEMmO&XUPejBug<|)l**6ybkQCIuWhqYO%2f9$YX z(mTa*IFPMq+)~1|!E%tvrrMrbw3i9@m$sa!BWk-kh7ZQkpEHciZxE3kmU1CJX z%+AK!=RvI6c+HgUhj4PL;ey1pz2E#hX?G=$QJ$=aSPkOYXGC8J=sD}xknyXcgwci( z4?eCZ!zhb<;spOVGxN!MD?WVStF^CAD8dp*?~>7fwWhgts3wsOhA7Rhu6<}=BCvT{ z=J8hUgFRh}CL(^8An_0mekT>pul5j!8qBg_*8?LG5$KY2!?yldywY~f z-9UP*Ndl`T0P`ZcfB7?xTNt7?wgB@5HW}=zXN33^`?$Vj-9AM6xGG}dulMOA!@3{K zkPae5kf1M0{>c;;T8#XO-Hmv^l8i!&-kFF{SaiR1{Bg(S)8qJ8l;lJw-Lk}$6rV6Y z2CcNqW)N9_zFuu|#c6obx)sb~xBt`4vznTRvJ(3xrZol^QZ+T=`aADndz=mbtMpz? 
z!<#sic|vpd<+nA&ih;c~4AzJ*>|8bi0}up;UWOaduErPz97dfj86UKb$<9V3_qm4{})ItQ* zA|#6z^|Dn6Pi0+7x5`vnwW}LXt*qDuy3a=$<(MA0b;9+R>x{uof3*2<;WeuFnG78*V#LqCyB$!zr z4wioN&H=Oali3zI>2YieuI;1wfWlmq1{kkp zqKhvih)C(Aj_okaB1ISo?YMemcjAax0qx}Z;#`pdpD|uTh~fjkkq zD@}?Y?)>}B5ys|QZSquE)a;DeJZAIIuLAL}P(p>VN9;`TxRB^Y*E1!Dt4wk0o)-(r zq{j|FH>JLU+wFqM3EtE+AAhqgnCq(GTglV2f+rx(% z+K{tHGDg*Rr@l_Wc^*tm84)QIFQg`!pPiIGpV=U%q{OCjsvN3-TQI1sRahr;+OQ=w z5vE9Az#A<^BpEFqE9)XFBX6UyG;GRhp-w^{sw}P{SeTQX!<*(Mn_0-2Z#RBnUNKHH zzBOJ_(5OgVsF7bW-c^WS7+XlJ*j%(&_;&L9q^^Z6o3Xi~>5Gw#;f<**i#pRz(=`i* z$@y%zw-p*8Sw_XU868szrBzDXm6HZsqSa{?n><%MS8uKe++*L!&50&vCTD8pG>X+r z)ys%Uc4<+onFLD*4B912YrUURkIhWXtyetv-IH37aqo1e+(g(y zIgva)-&z{JH*8@-WbS5DwU)K9UBY0mWdFjFWLYzE@I`N*X;-hQ&9&`aAkj1DE=0C+ z_F(n|6Wm2^-kgm;sX*T27bg<#e2P}W|o1ZS~N}FG(GP4U>0Fk)|)YtF*G`E|Lu=sA7LN2 zvLS7b(vJzkLI>sjVyhyBB7D^;4VSr%Ia|wMD|5@enNiC}mee!KGuqYUv&_5pyKkno z%EWegcPe&icNn?To~Aq%3`Bdn+lJKE7#Q+N1#RCpi>v%a$P0@X#9W(PBAf$G1upKK zt@gsRpZ|~#hF+}gtR$^B3|y9k#7O%0X~v4iPL-UM@Ju03ZCkTe%T}vdqgj{E6U{%~ zd%j@fHq0yPMt5L(cCoej=k;OH>CR#K$-DD{lPz6y7K8YZ`1MAl#kY$U4PkCRCx<7m zPdJy4V@oq_O7pGqLHMKc(AalO7vqYea*MgRZ?22QG*^NH` z^5Tn8SFhf#aU-hz(@1!tr|v^%dPBOYsGT3$K1hC?6RZ@B6r2p?L1aT8KrTkwML9-H z#B+uuVZ0=0;%WG3(7$3=!0Cmljmbi$PL>~m6tPWCEHWb|kwGYyo=GNJ{KAQGmPQ>% zi};Z~2Q#g~Ue|29bTBWW*q4yd!p|Or`&j(Ecbag#dV;Mb{w_gpgCw!{sb;7oq7tGe zqPnP5sGDVwq}3uuIo;bg2r?P733HGYo{x%~^_b0#U3K|?QRB_ADC(MyEMxDQ`H~3P zYjtfqbXyIRkxgw$wq;5VPl;@zy+vXFq|vcEv}~wod~!J5z}A3&BXCFK9o8cAUGRWz zjL|{F*d<11vbQK@$9LywN1IfmM*D(>n#V+8@OnOOo~ZVb+iN$EW2K|9n4#W8OqzEsXHKEdB?BfHNC&Oncfq9#tt z;1prab}G0X7g0#X4aB>DgO=KvILWhMd6l$@Zt~sOGCrukKW>JnPk-`8O{=2Kct1%f zK8kz&ymrt*rQeI2(MWsMbM>-fo&SzB4UN~$ zC;IUYYHl;J9St4$K2DcTyQPmLA+eS8r$fHRT$99rnu6ZYwh@ zN0+$QySHwk{Mq}HI4L_RKIK>vsy_F9(0-xZNoUfev{?>Qv8;MD&tC4&HQEiD3i}i5 zq6Nc-e9N)if!2ZQAO2SPh2x6T(I?+IV zD=1kssUsj}>$fO!1TX5~W+)Y1DMG<3wFiz?mH7HJP zjksU9ge{?0yZxBZI;)$v?YMqtaNT`|t@5ZOr4G+?cgJb)RFXZzrgG_hom-)SrPw$>C1y&VsKkL+`{y;y!GciGdx1!50wUhXh{RPX7lS1Th6T@77~DL6cr zqipxDcg-$Y#!My_N*21@1wBf5h>sh#11>t&7bE%E?u$Ilu8)Rw{Cuj;Lie`aDc#r( z+z+3BWanP2Yqr`8+x0nMxL;U5&n8CY+qvn!aotns%{57-_969ByPnvySgr21Ie&75 z@T6T6N#%~#_h+!NG2DSW0!*bM%*KJIC_yI2eGf_oM@28@g^a45=r?>B81t2T4^76K z`b`+!r8llOAFj3xU_WQWhHs$BvR9{bxhhAc(i#OI4#6fGG#mdm@%2%_F&QNObyxcs zqU3VmaLssE?fGe4@E06F6v0NS62?+eFf`yZA`Cn%A!PLf4ge}e;e1T*uuI2y(gGUK{!Ai(e9Dwns%oJ4}Ri)l@8Q54c=o#AR z8!@89166m;k`Zi9Eykun1iT?ik>pG2G z&HkCm+TmZf1#XZL`U@j70~6!lV}nz9px<)InzOW^PGqJP&=c)hk)BiqI$-&58$i@oH>B#rbdj0F<|NQY^C-N{tZ~Z?^@t2(+ zz6FNnL*`-pd(rrirFh7z!8*P$6P8f~?@-SKf3Q$5diB>k^s{MG;6BO>42%Gbgz#HM zSJ<6ogqr#G#_BFsh4&k}@0oLi1Owq2FdiY{qoxPgOTfjxqu>)@u#&H7C4(RcNuob?{c_4_AdCGwW^ ze~kxz7U*6k_`jNo6&?ffZcGse=bzdCz7T)Zl>d2M|DEPxrPBUuwEueA|M&Ywzyj;y zsE={rWOK|Q;Cdv?+i)$(VK9yue}EDL-tTfFGwjRP!2w7go$=4d_>QqkVNq04kn?$m zFamz>hRfekGW}#Bbox(FpXvTsbhoHDXhl&8?UVXDjtq;#ArgdUNzOvU&}>}GbDIXe zr#uG&$6#RP@uP0?FxK2uZ}$NHmjk4>CnI8sg(Y<~IFLwA@9W)a(GQ|A*IR|9{mVZ; z4m`%!499JM;eD~9cdaZ*%>rTnJHQFBBas+7Vv~$OL)5qEmN;gE{=+U}o6gYZ_;$>nU^ZQ+R5J!$$dUR?v;7zw70* ziFS*3jN%McFL}?=RnObUyD4Eq&X#<99C|2Y4LhY4Ie_&B(gR;6oU9T49Uoc0!3c^9 z9Y%71(Cjx!`MnjHHjaJClZ9qEgt5s~S;MMoqi!0jy~lv4XRa){trJ!3l=pQaM^PD?21a(^ zY0%+GD~Pf>QEc8_H{Z```rMu}?+@`^_XCfmUh8`NcP=x9&8I`P03kX~CI@eM)$-dvK z-%p^7)f}a3zByVUB$&N>QyOk}`6o6o2G<*w?_&Am0m0#|Bg32CHW#cn&ND{%OCb*b z>?#g8=jBfyp*h~~DT%Jn_>;3PV_p0+)E$lN;iXm-#uq!lHGekW-|j2Q2fnZur)yMF z(PEo6FD%yg_D%sV<*Hk4B_-8&?s<2){-~Z?)HuE?jF{N-c!1qcD-9w`aLeOZQa5WQ z{)pE8qNj5(S0?qPDoSKo|k2NIQy2s`EuC*9QEk&4XOqic%_W(`>Ik1io~C zMVqhaL#qxyXIUXNQK>{tE3M>T_Ca1GLF|Q{Fx@=3Z*9!Z^JlE4vUCkvv>nX@od2xg 
[GIT binary patch data omitted: base85-encoded PNG contents]
literal 0
HcmV?d00001

diff --git a/docs/source/assets/kernel/value.png b/docs/source/assets/kernel/value.png
new file mode 100644
index 0000000000000000000000000000000000000000..f585c77b2e1449825a3c704cce6b102f567696a8
GIT binary patch
literal 121414

[GIT binary patch data omitted: base85-encoded PNG contents]
zpfQkrjmMy;yBHs{OR#U>?ykgq_lsYp=oV5)lMS^!cgdgn5|`aRML=`3J0 zlntoLPw84y-Yh@Ga;m?$Mtk`nPtheJBD`LXqdx3U^{Ib(8W0tJ(JXN32I|J+I985g zR+5YAqZ1?=NgHWnwp|W@jT;nqp6W~$-;zvIJ-0q7(dF5>-kwsu?%vgQl{4vZ-c7tT z!1{GSv+x$*i*Y>X)>AoxpAX6sq0o+an;Y-F<_kJjX8Sro9k%zy0Up+U`-xEd@CZ1U zYQOk)Er8O41&DQ@hj!sbnOidNC<6B3v+fIgLyT?|?BwAHgBPC&UP=IxPys+L7{ zB)5-=OMD2J(UTyGlz{lI0I=vT1c^?$_%}j%nFQ=w)4CdUg_K&78%}KAc)u{>!B9p^ za{T%s+zsF38fU&7I$E9C9gbSU-Q^iJ{O;?;ra*6V2BM0Enb|MUUo#hOKYKz_%lWPW z$ADq;C2QUrpFy=7lkzsB{~+!Li>Xj*9g3koKfh>;66-~oGEMzE%$QW(`{+4h3TGk@ z3Y6MvK=3KtjZ-J?yKSAG6gkee_uy>V0?>&p7Wmq#hjANo0@cnoF0)Stdw@9`_bt?{@UEzh0Jn1oD3q$tR8dIa z?i_`lK|UPTDfH<(JXM2(GT0-mQxr?A0qD8?dpl$hk>-?qLT*rwxh-}`Mwm?gwM>$Pu1js2*bY5xayKU{Z2%Wa`fW{U8 z9{xLnwiP4A41}>qWUwVN@XGh!Alnaq7{k1D=3WB_=MpHPBp?XFsaOL$1=4!KunbqhYuH)q4%6k=-VxGFoECiYnTO4*5pl# z5(TOe-OX)e&9&Hd13Hwqz~z;=u{@mwIx`DFfs+^2R&Amv_|3XbUndKE1!lmjtVubu zOX&d}$_^Mb*wq0BGYJ8Hn)A((X80r0wkjqjCjGL%q-b?ft{(4UVMf%3+a%?ZJIaUb z7`9~)79+rPlj7+ZxyB$HOSh5>BzT_BA!2%yGHa|9kjP;J=KTZB#D)1?+VoA)>C0>JB5(u!RRyHmHmdNH=bd#!%)iJmCyL zi``7Bi=x>D%$zyE?aUYJcXz73?tWm|N#B6My@em3Po-PkVfic(QuViGJqq^=fh?I> zc+9NB;Wdp;d?T4SkMk;^yw(!|PtE4+ckTju1Ww_QO7b@!A_wx7^l5wVl^(eyNa4)g zRsbX1iy-n6mZK^0K63?-_1S$a;3bzSE0Z9Z4=1U7rXn}agaY?)fZmZxTV~nST4<3+ zK~lY9%+GrnLE22Y`>(r~#K(hb!COWE86|wXQm&lHuPwU~eY2=`Ez~PrYN_GC6kvC+ zd@HR4uyDK7)`F&M@&O8>KsYI@)!z{9F=UP9aW9*r=F@8{5S3AN_ z+-=aD)_J)@SVv_)I<7h~^N}$|`<(~{(SDcs1=8mpUvCIVNHDr~#{)B(TO*@$RsHqN zt*s6a{&wa$WjSy)iq(_i`|z}sPO*ws)%a}^iJEKRjRa*I$V@^DrN!wy1*YDi&;|IQ zEX(9;;@XwF-W8fL`C`Fg_Ysqqj&QxtsjfEudc$(l32nTqkNNb}OK+$9s__`Xuy#^; zlUEe52dBS<%wM84(&}s^#J|9P&D;Irn4b8qu2niEIX(s+`O^!%u?O6G@*XXEeytZ@ zz&pb!uAgnV!|%V*)(h{cE>xn`p0@Dx^z{RH>6ED8meJ#gu$qf+;p#JzxRa2$K@@RGFhW{Ix5OJOSN|;j)w7S}5tL-yW)ee$E%v z4c`DJ)9Q-kYJcdRHS$)g!uQ`_g5qw)lOLjpo~dcO zq_%o7K;(MMaH6sDo?-a+oSbRhGbx0hFY z-PYnQW4QBbp2;q+Iz;*XqTdP{bep;o5M@3}?ji1xnkk{8Q(L)ZY?NYp^)QzkLGZSD zTvXXYJguBVrcF5+r}U)KHjWj_^JC{$a{+WcV7_1-|7s60>i@N)z)-jOWotntM_!M`&Sf2N7WU&5;; z=jJUf+S-*qXD&wTTc46afeJS_hXHbOsP+&?YrQL-nhDztVh{o@?^^;{*+qpZtVi%fv3fy#?1T0 z;{~5v+H0LTzSns5_LrEkG$>x3DQL~ng?3@a4K2%Rl-8UJ;pqubVBy|=ZSvz%47pCM zj}hQCIJXJ=mCEHD?vcMR{ulkUL?rnmKLfwz1uyNldK^ueS&RY zpdwx(MhU2%RdOZ79lhDeHqu1s3wQ@h-;R7*4nm7GGVLSkDfHIOTF-MeVvzjk-=IA1!LlXj|w9axM-FvI*(M zem<+&n;h9P(Xo(U_)&wKlVJQSGnws|{?Hd@sz`L9(2}*K`-r*3`o$Z~6!`G8MufZn z#w*%7HSSL{-${js)g0~RH={kc*#tc$DvRO*sf zKbgl#IFv_$xv{geUT=~g+T*zTJv`34{cKZzN0$PTGROWCxm9Bi??1R#%ocSIHM)*k zJs-XsBwV}=doCFkG@a7BKeD&EOx3>heL9=At$*X$QR4?&*s;T3Dfw^4AmH$;*E8$I zhGdS{L7tk5>M^NY|;~1PjYKP`t+-^2zekeJH-3zGpAlC z;CYT6fQja2C?NQYdTZqy2Zd~}6nGCZxeu$CTusEVwhZ(vNzq}iQPOLp8s1pDF z%gci-?RM1HuE6WOEc55r`HtY>X=Yu)6(eV_Y~skpz^Nl?-!INWEXaM5=+L_W?3%41yU9l|bo%LIc_98XO5)ef=t3$5F6fA#`67o;EarKez)6G2h zc0e{TF!lL|a*JB#l?+TH%O1VR%88Sotvsrj$-~*9vklq z*8_~Mx0B&>1?;;$hmXpU=SLoskhN^B7A+)EzN3KY&MHmK;npd9x>-B${Ocp5evFw; zT~TrPnE1*!RsId?Fe9^bRb8_?Ii9Mk8LG2Lo~%&UJEh`YCj~gjSV0@<)P7T~Iz#65 zgtc${r`{u?@e$RS_g5J1-=JN>v^MRB)Rgri6QPXIR)uVR*CuISiQ2;>w=tIU_dM9V zuM#^GQ3rwbTu%%|?M9-U0*Oe^B3oM~s}7%-AFbqFpVkZ7%8_|{=ZO)Mh(LwN>VmP0QbJh2Q6U$YzR$YZ_`rUidGe;w9Oh!v_h8`P^^M@>2qgbanb%}4Z-*YQ&GyVthrb4SNF9Y zUcEp@LBl44o`m}`h@cJ@JtEKd9c`R`vsPvJK$O&o+V=P7Ue4ZgGV1HumuYjHoXaAD zQIhr_pIaHYzJT?UCV>L6T}Xv0S_hz#xEjOniITK^bXMNVRpF!Ux0)attkPT$LxsAk zE*1U-t^hQ>(>%rhqVBE3vRt=raYYmmNkx)M@~?~ zvZz_31efOx9zBi2n%3P{WO|n9-R9hFVE3-kRgM{%3Z=QJudr@0+%{2%yovd#dcRYc zQ*59LokdmQ-49-;<3ikjyxDkfhmUbq@ffA?$GBS7W0)Eot_crXy+;JBWEA|Rq5gHU z;vKxCNS>GIIx8c>l!lYXH)^j&BD;Zs7G`Gicpq5k)@JmIA;ER6iG)BeHbfCuj3<`+ z3d_Z8A2VZ_*ghI<-=B4FU1%6OG8EhhIaryO6QSx}?Xi@g;fK~QfyYAg(K{kS%x>&S 
zXM$dCE}b8o+rvpl230o-(kCf$<{gGxJDqUeHVxQYw}jc}2M(Lxl)CN)D-v`0lg{Z* zI2R@Ky>S4>VoZ=5Hl`#r>QC&T)Oc-=TxC>lKe`sB1ZJR3=bRKZ=}f7Zxq)Bh{USdc z@}Q&bf}`*Q4tif`OeC5 zGBBje@z?UuHIiwfLd6!aCw5PJ$&o*tv+TXZ)Ff-&E<=bNBbISJ@Of+WfQ}qZ{*$lJ zRci8($glBLeW`vva7D&0_VS)#l{Lej$7+B<&RPU7k7BptP7`T9h}m@|i`P3uYd)5^ zoa&crlBC#pEVavEBz&h`{O)V)V#6r_#z4nd8EPN3s_PT-fX-}tn#J1tZ z+8L)`>~uNW>fhepTYg*<*Q!~RpO{XpJYi#sZG1gfrwBD(xrUOq%uPKhJib3!lX&b- z3qd1ory?|POQ{Ol)tGDLSo7Yu*AfX2g!?z0w!}t5-ltY9muHXb(~h;B4k=%`Jkd&* znO>iq_P!z_e`3kW8{LL2#vM31AuNDT{l7lVv~m zK}&R-$mD%VdQ0}UcF_f2a{o-`QttU+T5bJ!$Kko9tbpmAJcL^ za2~klt1$qJR99DfVoG>S$}0F6L^o`C+>@euzzH%+J>O7&31Piwv+!A^2I!|>4%e5u z>_!FZ@FvSeiDMXegaZJ{$N|64^u+mX6ODX3y@iqyTe*F2h4;h&sE||q6ZF$leD>Cz zTTQ~Bb)hz%avIqhN(k^UUkz}QUC_&_X9|;{gG}G(@rwR@!(Q1E7PaeJbqf_1;in?I z-|M0WEH!B?lEuGI#h4O zG$@O}0&T7KbiZwrjzm2Y>09e^5zmZHU@*fkq?Y`M={AP*$cb={ZF}nrL|Nj&H+{1x(px5&f^gqX>?WNX#bC_sGND{ z?0OyZDpz$8zdAFSt@2V)s}?0!8z~&tCIKslBz2|x#F#Xl6vwOLyEf7jCukX5GN=2= z0K>BcaO?9D#D`zza=se6Ekq|n_Ku1%W?SI#Wc zs5C)uI_!UN93rKx`NP@qG^=HY*{|0FG+@jpLV>QdrBxobvq+ViMxqaEpDar5 z`ei(e-W$prpYA27<@Zlws24SNu5t&j+Kz6*%7|F~tTO#=$fJKavE(Rv7x_?hpLzAi?s`Z5qrHuM58?H?+apmfEoOP>j3oE+ zj|Q%c0wdq$41THF>k#-O)kUG;Iyi}3Xd}p=O#lK&UfJhcBi2KkMp1f^Iwtiw{Drqj zaWEw5;S+q)jH|pl<3ONp-L11#{IhB<`5C_d^i(jl-LOkpOl4ytz&u^oU$86nDd8k# z#M{cYshm?1-cUyi&w<)s>$*7;&!`lpM&A@|BD142_)Pecr93<%`4O$Hxlz)NAa?JW)^tm7(^9gIt)w!|uahP~ss_{KtF zOKh*NW`84}fNl~&G^VW&HmzIU4RnzkWWKTdI=IJd5WYy_H*&tydReP^ZENIx(mZ~OfH|Z`9I@h(kMp(O1kt~ zy`h~!n~wv58#u8C%4CC} z677ZKTb!ytzEB3CV@C*?cs)vC?Q;7!&MO zx!oG64_2wA>j-)1Ut%7|?q!6~P9iX&B|LpxDF5}l_0(9nA3wW(Jwcs`yZ2 z+jf%x{gIc_6Mh72lq(jE@)-=oNaklwOItUi&+c9!3d9m-qY}*+r0icjM%Xe zb{C_?kE$^@G*{3y`wKPgucG9U`OSMm?03HAk);8lSmH(4x!gGS)wkApEcz}~m_O;2 zlflhIkZ~#D*y`0F>sO2x2X91wU0MgcLsUokW+Tss@|!$qlZMNyZhN$E?xo2hTJ=D^ zs-fMVjiT(srIjvnYd(CHw9Pa7;Rnca;MmW4!_5n87WX26^gmj;pto`@R=ql+DZ)Z) zhZ{5icPEGPZdvWGQXh2fLayg?)L1sXdRl8*j-$#julkqy11;+lB{+;|5~Y=g8o4Yfg96%#HQxWx3c`oHQBc5OSnud|45SH(`jOQYe8JZ48X-k9ecR8mRsX zJx|G6E4>Qit}1liD%NrA!WXp&1mC&RfVg3tj|e2kw&y(#2*176pn$qi#p_;tjFXq& zI<1cTEPJ{%(24_%lm2eT){DHy!+y+}rzxB#U#jR9pS`1O<8Xu1Y9$3ghW%b}AkxXL zzbU+GJUA+wAaFf&QEOcyw%u?0%zG*@}@`~7G% z2_>*o6SgxT@AkR~T-q?N8uw>?ZP7v5^Ql*+W;uT6+ybD>!b8HF0}p~OAMXfTTQ)QH zcs`dBn??(F1kUA0^H~{EZwMD#UdO&%1Z21C?X945w>Dk#%|78^%iqVy^PLBq@Yf3# z#BY-439%!eN<|>f0YTHsXkC<)5~&ORfsnuqOqs34<{WA;arcpIs0wgk$Ck>&Ac92~hN$qAMMKTGcrC0C(duQY&H z%qq*{`QxbdO>narL5wcew#?S2+el^#CtMR@JG{nBZ$%e=3df9%vsRw2c7zsI&H&j| zk;gg786#`PXbx@9l8?TWwn^qm8_B!n?|+z`{PKQoVBof7s|&6sYg-H#qyGZr=9~xy z!R2bZd7_0dlD92aUj*zCIcB`>fHRe^Ya-OIRxhwrnpmG_Vxi%4o|2HYUYY=X{&+p! 
z(a_rG=|V~@#XHsL(}YaF4TGOyKOd4PdPaz)MbaiZ*f;r!+>fUCvEH2)_8a$BAeFY< zh&cOm8SLulo-b3r$<7Z+M$yiJ>4!=aySZxEK>G6L+7mF^(X4+wY*iAWJ&ZoyGQ0wg z5U92Jr)LEclF2&g57_t5D)Q61qrNTueCtxNW)iUnemYZ`SNuL~sA`I%71(Hq)MchR zV27`6Qi$8)1C9Dr9F${PeHj0c{w%|A0MGjpR(8$7;x$P3tY+i5heLG#n}DF)%9YGH z9CmlgVmKze>;{bKPWe14TW7AXg%k$Lt^_c(>2+Vtx>#_>H@slU3#DhsG>@Jc4&b18 zo*Qy=-2@etdUkg4zCQnX;b)aNWSDg~N+Lb=;gi*s)}%S8W&16&UGi6bDg-W-d6z!!^WEktsLVU$gHpDdjce(66WU z9#;nK5T~K+Q02O0fAXd1s;!aSZ4Kw9b2B-&-M3o|B2Sw5wEzNgh54ZngG6^>+732~ zt=VD!@W#dw$Xpw5!nmL1GTfVtuj_E^%wiL55v1s5JYv!gu^e!;fCmVJ( z1{Ww!N1_x{$<6-i&7v;dIKjwB?t|3n;9gha*kKL32y-fw=)t;Zesi9H!qiqhsX<`7 z6@c>790zM0L3mg?C zN@+jiv%8M(j0FoVn|_j9r!lKjXf}g z+Dj%eZ=k=3PSo*p%{vUY`m)L3>YAE^of>OPj-D63?ExV}Lb*XakF9bSnm0Ll9nC`w ziOLoiV#naC1W~S8=hVi_UF7#ZBDg^7)0Tp&F-QV$S3(K;%ZHEjsTezEVY+YMW@q^& zq;h6A;+|0N7=t|=SHaTpG1qi>gOrXM5H#O;p}<*-o6IFs=o9XFPx-*k6@EyPv#gTW&}FqGo6w zslLQack?Z(Y(8J}YxzDu`eo&!*xBqV7T>qLc9umy8A9JlLhf?!GNCBmNrQ``;1L$- zcDY8BB>39$?2|;^#_W0K&57LZs3Kn!uJrnc^2Aye3F{#9G$#Ty{Z=6}x%u|0up)tm zf!6?3PEd5J;B-n9WwJ?+9G>j2ax|Z)P}L5x@|y|W5=Cqh`wU_-nSKF02EV$aRBJfQ z#(oXeT{30_sdES%C z{-wz*T?cTR+s9N?I6{wD7Pf?=%UT&pA+2nMBNdP2gr{Q1arniHX6vR-zeo%{q#)h( z0{G6@`YG_auDNJZ=SyfhhU^OvsmI-AvgyFlDw@{pz5bFNum_n^o2CtY;h$fhji$`5 zg94Cs*i`jSzYr(;uS{uF*z{}reTMRelS$IlZiQFQ%Ni=ABdw$*E49vtk@^f72kX*r zK4H^s-xVNb+%BV-5uM|ze!m-IlBm6`!0hhG?=q+_$}PK$n_Y_DLF4O%1j~iKuRCbj z-r3oCGx3WSBB}m8{a=W#iv`5y79=K?xC=_16-MDNHIZ7$>5F( zCVn}kHU3U+^;h||9+9O^7ZYN)zO(8Rbh*`9B0#-UU?{2!V~|6Q%(x+mPpod0uPXnc`AXF39KIJy9X!vl$(05{7l z+sz~TS12iD+(kno^@0L(Zj(DMndQlgBiKeS>-`d;G_m)fkPUpdE?!pGm60HN?Y)w} zx#CYb{E(XM>A03D8B;Y|=P85NbNWw~G*&zO#D2+N#*tpG^a&gmxE-#o^WFpeZ(p^@ zOwfPp#0f;qzcgYTzx3Ez%Wjyf+p?Br`rwPd zd_^m}+t?_U+Z$1Db5hODK(br|P>hXQP04J7CHTOuhwU>vJ6#{KdVNo>IS2Vz#5|ZS zalD!+u#1-3cyU0QG)<>q@8Dc;8nv3(UVb_L5xZzC@5j|}{w3AeWR#s_@HeIa{k!}X zPYobGEbP8P`d8|Ce+7+|Xo7@U);hDGMla#s%)ex!yg z`rkZ{BU}{rNdV_6asIf>y0)O_GzcizlKgQ!z&FQvyOekFqvFe;I}Umuqt>r}F#)$h zA>cb)dV=TM!8Iro@y53I1TR|}PJNT>Loy7UywxvWMK0VZEL3#qwTUXf9+zZ<)nF89 z({=TtbflZ#Xx=EvvubJtXrSLkQQ(O_UP`k=iu*}Thj+N_5gjt03vo3ycGWo4OjwDW z9zA+n$?@T~mw6>tS~|`P+&p3}2YJ>(A<_9Ilx;7UPL;sKdc16?+>);>Fg%e6iG$nslGL-|)Jb z`LV%L{aE7mcmluIWMl>lNzXG3ZL97e&TfoKMfL`-)5qmEKRI|KqjlE)OA0x^nzgD` zMe;?9iw457AM1ql0gE=Oqk<}A^~2~=Z-Jm-nEOfDwYwZ>Ud-2_ua96)R<6x;M2?{g-D{Y z{;jKDhWWGgT=n(q$+xtv?|7lAv24Uk^xYq`FA3?W+zX-nUoa(~rqA?ny%ps4el?pC zYhW?Sk||r;i=jNPa)FvXKM^IE?x9G5j#64jmFqjqf1CE1%YJOGYp<3snP!scd|Bs8 zBM#rHMpI)$L-WTF7Xv}@PQ2mBAeVKs4|IO2zbL4v*v(3}4`;_PAfk>)`8KhZeLEpE zbwGDlaK=^ol)^6Q41GZaBb_P-PW!%!l76YrO+^{x)#mWsn_w$Wb1B_^ZbU&HOU-e~YIsB0fkU`^EtRszTI^3@ z$1lz!F!N;A<}CGZJ-}I<=Pa41v)_>FIpu^s-GN_bk0wj6KMk9b)DSbrW`U#tx; zu`;3^>U{8w`VT9==G+L8&37ReS=VB6i-%a6ZJQXPqQN}BulZq4sHtVX`ub3RHE;7e z!cS{xkJ09;8M=!`X-q3vyQsBYk2l3TUU4g;YfEP=qNH+)t*fRV2vxZ@`j0sNCk;Ai zDOdmd5TVG?4?-h!T&VfeK-9N}>gPO;qn?XClh2RJIWd?thYXw@NVw}I9o7up^DR4+Vu+k5oN?YFf^ z%Sz&fN>J(nPo}^G@{hZ1W4j7vBc(qxILa2Sl-k#}fBGyDnOb%zt|Pu^g03SrjTbAt zbV2!6>?*hDHoBm%WpjNYSnoPBu}|ZKp>H<15O>vL-)1^@@l5W!DutHDLfUT{@>WXS z{u4v*0w^VU9aXZ|1Xg!F4pt5>3Mpl5CyZ?;%$6<@S-K;Pu%TkRKr$i5Ol@!lc$wf< zaBsRj!D8_h&MB4rz?VTdbU-Q*|H|z65odsiOlr%sFm>l;0*Zb~U7B(S{1YEqDm}GL z_<>;qc-NJc65G0Vl84nm873FH@o7MXVH&(BD+YmC(e~uadZNsS@`x*|XfQfPP;FY2GXg>nbCQ z=0!{wF+Lt4EOUC7<-#7tX`qQ(lxdmkd?rr*QG8RFia|&2hvRrw%K;!9G{o{n`M!;M zj)V0H<7Y9%iNuugPy*QLc$kahPrBr)UEYW;V)!2-mcxV!I{}@-N^jWmD%YB484Rvr z2zK4|=sEOR`$ulNZw+efHXp|}hr{?fC;bYp;-6bvJCF^qiaFVol z@k*3h7tbhNX5URUhmVSP48bSv(p8O1X#ug9bCY#?PgzKh01v)_eUbzxPf2voUdTW7 z3^unUG|$YI7pBC-Y9eC&r3fxIC6b%fXEFNvoR}L!aJQ_+nlB%R>u0-1G$>%|ybzc$ 
zTsD3?V{yQ_CP*)fyGSsXYSVH?n-7^Tnrk;bBQ6Gq*vAY61q7bDaIsDMlg{7~zFj|% zely7Xa3pg>prNQYfWrafC1FV;)sw(DgNvnDv8FE96~dz;7j;ObIvzPQ6lF_2a!2?F zEH|oJ{I9%F*LZQln}+apGRq^ERW675-tr!{^B9lX#pyDq&s^AAVoVnG`9YW`>N74Q ztnFY407MYAamA_Qcy(PA#f~4Yy(v-G4){qhCco_tjQBTedXHfq3LEmk=fEJ&LI zsIV6{$Z|=`sF;>+*mddlZLEPSZ0|t*=>;Rd_Cd*u3XdOem2Ue-i>r<4I(_rbN|D=D z??7v&vgVc2In2lFRPzv&>D_(TXBI3$LkjU+cDLT_8N|mMuw56x)2WKpt4RtV<=|a$ zY828Jj(%S++PMXSRSDy~O(lBC^Hb=bkUimuVy z%`lv~%`xa)uG%pMa0MGPH6!6?*;=RJ4`EmRnTO?b>E%|HE*s_?2Nxt@I-^+eR$@QyYLYsyBVJ0F9&U!A z!Dmc0>n%QcFs|Ud%5xT5Vt+|5@kb4&zWcV(Iyfixn+6L;ac3+h?aw7it+$pxaP4DE zmQ?L+Q5TjbTS`IQQ<@+B1(;>U2JSb@o3;D17dIA*yikCJN+)A@;?TYYDAV7YFb?lV z0H(RS1>RXy6cSGn9vMa0_grm00xnG-+f$I}#u(po-Bw0%(@34WBu;6M`=eOEy)Up? z#E8#4>fl%Y!8PiiwOFqo{2J}LUZm4{HE}lOY{myiA?#8r)R_`1EUV^B$*+(=0;V~0 z97Gmu_BPGu<5#=N|A8i^jZhPQ8vE?WB&43gW86z#zna4qu7}Py7PW@r_uS(=Hn> z3h8YR$Xti@iup!tuPJ-g zb5()@C)DGPz?mLLeW=aicw@S{y4Kvx!5W$mun@Vzr>V=nZI{v7Z?f#Edxe$9R{?ao zFnZ$gsDsuz1cN}qg2zu^kIdVc^KYp4GT%wKur88UVQF3%rX1{tUoDYurT*=>(K|d4 z5+K|#frl|Uiy;h6&z??K76^yRI@U`Na3E@;Hp093XyeR#;iSg_$+NB`Ij~#_j;$Z` z%?_Im3K-_L<#7J$iN#_$)hLpTzLNh1r`~oUwQ8);pI^h8t?2zcmx|Qw zmQ`lPHf@;`fUeyotYP;--vO<`6Cl$*2o#D&NoS2nw_6-dQS0Ofy*5IUH&6!k7do;p z=sQnE^i~R%8a9Rb?;;Sf#q<;=<{Hh{Y3c%D_4P*~O4!^tYlA~Ge=794_H1+myIK9E zK~0UOX^x4iOeiQ10C{B>0)Mf^7#DoySHC87m&o98T^ClsVDMmsPYVz z;17Ys+H}8f6Wc|jpTYAh%0T~KzVBP(uq8jPyn&d&o=ui!^;hRnnHzVbe#DIAYf&cz z%=F)&lyxk1uU~aa{OR~gU^VIdrr(Uf)GIY&0hY%Ik$U&a8TBULs$D>AB3AVGbC~bB zVmRWj_>X;Bb(i3%{yFZyOH4s7lUt4@d)IPOB&|~Wd?

v#ADmd@Fjjc*B;H4OA0q zBqNiJ!7P67`*gUk>PHTlRlX|5j<~7sS7&1|gm3mTZgsxi8GBNJMqQlelVsM~!vM!^ zY2^+2pnnoHn6=BdyVXf@{LVMXy! z`xX&QTLw`3NqGp_;+>uej}%jVwF_lkV)HnHiE9ve>_t)mCYiNuLHVN!%N+A#6JwI@ zsu0zILVii^7nI9IZC10w7gF}UfDQ-G~~_d zmS}YXF(6KddgHQeLn>AiwE87^qJx!ZCkE{Z*QQ>^A~oWxXOQN3=SoL!l#I@_b1?os`2L*5lE zy9=IqiBGc!sKJ^j%%!+bwkZpvft)ytN}x&b3p5i&BX$%RWgxg~JISgTN0Us1+cfq` zYSUrNjDGve2_7p29am&7HwRB6z{R!PiGB#WD;gdT9S#kgCbMcREG#)k6{#8CUxy=G zN6+To-G>)Eygf%{6qW3Np3)zGpzD?Ud=P{}k@g(CUGI$t#!i-}Eg=TwElAiW$cvIS%2yl6)u>-| zQ=z(pvmPs5l36cd*fXSbWEAI8sO>$Go|mP=ymXvfpjA&opvQ%}>?D`@wqmJLi73;GV-_-ZzRh=jj7;OmotS)f3v|uf_zX@ zsV8TPgS50mSowz0H4%k48;Ag*t*j)Ot195C&zo2nsvf)If-B!7Qr7u;T&99rE+?x$ zn@7LcCC^+iE{EBQ2nAxKXCyXEc}^E6Mu!U`6+}SEMx#zSom0@xvIjl2371B!IxjXV z993qa#oLQp=J+YzQ)~vF%2Iss6&HnepqKjl2L3!D9cks~0m}uHxE}oM%@EO0(Z5u; z+``Wj_jqK`IK|iRm#~Ek4I*xqABL{pEN%x)|z*k zgHg4%1|>)3_m8K!+Kj03OY}L-F5X4E8sC|~4w>9F4)VBjez;|!ZBm_-pr0+0AbbB6-MPg%Xt4%6`0}+`@ko_UzjbjkON}yH z`@p^MwgOss|YYpDRU z6gJyv6)Y0#i~fw|c?DB&>K?tEUej52PXHmc_BQ+HBeG7$*?YYD8oRqE0;b+vTJHy+ zBsT{~Wj}e`w<9_9Xy+%OQQ7%<-*dlYX}g)^mQKqBH6|735}ga8_q%B3{+>z*08?`2 z;7UT%i-@F?DFhrWX9cyae<~9Tt4o;q;RHX!b@n2EI?l)gP`}LR?gw@KDxoj}s);r~ zly5V)3QtRDBeyt*MaV1eRZh7WuiR~w@Q*gGx9`VVqYEPDp!CnapJ15pCTwr40GdS9 zn)V7Ue_8Xp={LEIwZ=iuTnSBqs**w-hw-}?fccwGpOdgW5rHO2P0B9?3XUv36V-F* z<}K~84q?u+%x@sPdw=fY)HX;Kv{aNvEyg+U!mGnC@n0iVXPiLB2_zCj!rx&_jSD45 zL`K-w2DisX@SOEUV$924Po=ob{L|mPd*{bgt!(7Olvxe6uF4KF)2FAUOl-hg;!d+wK=jx+m#VgGH4KC`x}Eql)uf|Rvv+NIUCADpi=IxD7QEue~= zN%5s3Qj9t7?=XuoPzZ2{l4$ z;QEe*1B<2;AAaoD+h+oZlcK>US!H{1W4r10h8SG@AHRV^U3x~ze; z{SWfH-nggL9wCDfp5YSTe@F-T9KU=Z3`%PTKObb+`r4au!}MODlV**G<3QbRj}i zj(}$7nG$v(;=PdVH=a*~L43StTFIfNtZAnk*>GdZvmL-*o@riM#>W*FgJXOYyyiRw(4TYG{b63K* zw3ZmX4RHq9AwnE9jItm4>)}e9*-LV99B|WXEg|jC>4W*f;dvsaJa(uPK10_m-ml-M zVzPu{g+AyKRNhVJf3KvzJ`bJi$#~-Q;T%TL*-qIQGXm}9Z#i8ZfUneyf4eEH@uRjo zGs(#vr@~C8N^D|#e3%%IA&(`|Wkc@HXW9M-Y~++$1WA{I`Cq3dmLlp3^!Fww3RI11 z?A@g`vrLccb07K>vNxQeMrlxJhD+mPyZ(5Qrk>-@Y6q;E%(Lt`g`|nI>I@B`G+;+J zPm;BBb9bxrv5#7Ce=hqNGCA?T-0u~*tee|GZe98AV|rh(zUt*CR2kc^#1y=a zVd^sgXZx4=bIT%lYA8mPe?FdZmqUv=C5N6#^*wOO^zV>FQ3+^UHMT@)%CNoGko!r| zkwkTK0gqI*qkGjg@UgzIJC8wvDqD)02)BXWb#PvApNQ1UZWOD+`c~+~fPXNrRpU?E z!HYQr*`{deGjk{LE%w%^P4dCdSNGZ*pFEm&BdFZ}wa{3l)$4DtmEltNQ%P_xHQlY^ z8bcrp`m9HU)_P<`stQGW^%s*|(%g0k_Mdk;LWH6!351@A&~j?SnNh5cgucqs?$2L@ z^ZNDpIq^E-N4-1OVGlOSM*y{DA{>`9L(d5U)s61VOq8#F8=s)QY&&xtkN#8-rz~SS zD!%aEX8Rb>uQiEetGn`TuDx=6C3F*OPYD z-SWz&V_C@Y#Xz40Kl!=sCcV6Ag|=&SG~48;C*h0SOD$&lc`{#YStB+wMi=iL;x$|ouE=Gn+XgsXGQI(e`x z`cj{OpjKlaDB^|hf5xFX0P2)QB#E@_vS``m+%+__*SsdieUC>n3dC<&vEu+v?bChq z;bg;RvspK;F;FqMz&5>Tqwfy6kT2>T?l|`Z206C83&v(j-@v`yhITFU#mYww)tQ`t zOavh8OUp07ZHmUJOtp>ssRlxI)Ii<9Mdy49rOfx_!I5h&4S!c8~t!k?+CPQT}4?7J=Hfgq|^O(l9s&z zcCpqz;%eJRH#m7m4p*P!t*WYOs21arkdR36juN24M7L&kbJ>C?=r<muBEfP0#vAv3i0K7N5bz?xp?!)UPPqeKk{2l`xIwv>o}bTAaW7Z^ zSfZCk=*p&Vu&Y`SS!BT6W~uOen#duUC^Q1D4o%g`XNNDLcq#eGS8%EgO28c)sQ)UM z>%x|+^ui0}(#n3PxYhJkkp3A`o-PS=!4!P1Kibb6{_h)S!3f5Fo#=O49X15+99CWB zJ?913!^Ph-|8KTBhg3dps5Wo#QGYn%IbjjsE;M}^Bj|wAE`4;5vq)$@^Gz$H0^^6( z+r|YT^*MP~ixCrEY|#Jz$Wtho3boC163+9Tf6i5h{Eg&J%=l!~c4J?@fx(}>f%qPd zm~xxU0r$uZ(B_KQ5Aa#sLEJqJ{zQK$7S0_$?$ZjO?ST%k0MSF7)+0T7I$->eAeQN6 z-c{y9A&MxQq1N~d?4ff-1gk>K{hiO`W^gdXP! 
zEaVjYJ6E8;cq~qYM{<%c$>1Bz15s?ezq@S=P$DucM6eF}Pq>2xve6RBoPf~a_NZeX z`AV^YAb9A0a_Lc^fai}dv%Ff`DgWeHf1@?Z$s38TR*FG&pwuB#yhDg{I5sXs5c?<2 zYGj9hJ(mE%Y<#ilWnxS~YaNpDNNurrSu|L{{~i62o5b^o%^C0;jY}u+1I)azOK#S%?Pb-uwnMH8;W`+phw!iOQ06RZ&jE|xl z9zmkmivU&c(>yoDBP6qK>=hRZuw$ZmPh{|8e}b&qaQ7|F5%t6$SoNvq4_R;z|0Fbp zd%??GjRdFjg=E(CKcPYYIFEmdWAW#BoB#g(tu`>r1)y1JD>Uoz z4;IKoOs?Wi{Pk&MMBM4(pl!O*@BO=zmn0(;EIRgiw2dKf@&y0>UOQ8(QCC z-kW{fNa)oBWzimnUg}wR9LKRJ?t-0ZM55oD!ze!HiBAvNxk?2#gIoIe`nGka$+ZF9 zqkw!m`m8v}cO-4wymeFi==v1Fu42fHP1f!3L({MV3IaVEV*6&obLn?}%6G)kjqlsP zrh1r`{l|`O5Yl&u|2iE0Ii#^|R9G<4yVa)uJ`(?U0oY&gWV=%wotMWB#)1!|q^1A< z7%*nOXVOTCdwCRk$r9K=!|^F)((vUj10M}bnSYnIhek)Oz(@Vh(f^-i+oP+mt>_)cZlgeTAKNZ6w zMx3%Tk%>L*Ml*&TG>G^Q8O{2^-iv={i4dtRVI!!Qng5C_4d`I2iZn?%i70 zKoARVsy+SvQKOuV@z^MVk8EKU?2~jw=ifQPZVR-ch06Si!`zEB7{&~eB8MMnmK#Hd z!5vB_<6l4K(km(|dQkn{zn(K8E%QIjtBj%jw=i=Xo(qJ)HoNq<*X&cVJJ|3N#cAVK zOqg&CuKl%hAU4YTp~!EQ7XGETMlI(N&XWr`R?W_uau!J_x8*ZKQ8Y;Rw{JMh2|qn$ zU-Cs1#Kv*esw>#2joC-|7~6d=bBj)4=QtzB)5Q3Q_ zv=8Hq1jlROHfQwT7rG98Ioe?4$1@khe^jjy&Wa(?MhqT<6o)p8xZh^51!E0u^W|(!V5y z5HjQEzrRKWLu&z4ZAIk22r=ufxo>i;ff{78_n@Brn^5Y!hgobnc)z^7TwX~j{ zUb-Xp|5x=iSk@U^TIO^*&>q#sPsO?ristzQi_fcY#EbF&_RxY;Cha^Yhqrg*5pfmeqxTd`-b0E6uD`H-^)e5B6fhZ!TZV|m5)fP39-UXej${;?@WHiZ ze#_Z{LA9^|q7?)Mp!218aQ@2ePBS#*R%q+MFX3*m$S+eauLvSz*XoId zP+k7_cZUC2x`oxtSM~JtU~5Jr2trS(2|*PTiy;o$puY~AL#v=jV16p3EZjIsq{lp8 zpT*Wtrlg`ev}1)8-laeT*6F&6MS`?*c7I(BKobHhMYwo*HUHdE{#&=%e{GAOVtJCV zYDP22MMgge7HB}|vUO?;@`xH^tQQF3~bKf5)>qpUjmVi#c9a) z|Hf6BU;OvG&^r(J2v#|U>92^C4S%`{B^xQ!cOh2Vpe_B2Sb-7_3amEJ+y40sunG2u zP9TUcs1*J*XG`^0D>1P=0Asi(X6gSFf~~EIQ%_q=|nhCXcj(LYV!v1 zIO+B}29XG@16shog4e?0Qy zyf+063(!Makn}^rjDK`+XdwOIb}K(WLbcc!!Fbv>)ym;T@{s1pxAP6&6c2|9wPyge zUE1YM_c<&kLKOkMEHG}n28*I({w32w6WHaN_WR|Bc#r&_fN;kIv$&Aaca9A2{icGc@wkHgHswM=PD}dMzA5@5l+REO+Dlk|)PXW8>=mf{1CS9O_{;g;Ie>t^l*fqk6S(0-0-?A$qlOV#4 ze{F{+fWXmx|LisTI~@JTeh`Xyz^Zb@xzw@5c}}FdXcMY0bbX`1Tmw_|-|TAOQY=NT zEdU9QnnPEd->|p!zi0=MVuN_xdr_OAl@^L)=#tG?FplJW=OM7Ht}2flU4{3spQGxY z1D5k5$Um9vezv@rJUFtSedZV9T`q2S82lYqIM zGqFjqp(QO@dQtfkK4QR_PcL$bAH$4AwkaS04KQZ-W-wU~FM321j6lbQKxT_YP#nZx z#1hbf|M!|eRO2GZb^@gaP4A&zUxxP2QsD0|LPAqqPr{SvV)aTqZzw-+{IhZJUp{?L zOfbebOjk@WKhLgv;cttH|MH3ko`V4}B*gvG5M&+>ZqW3VQAGa)Zd6+Kt>X0vv`hiP z^cRaFK-XPSF&y|`3;eI*cY)ny466#Kp^CnWd;ZGz@4bR*fO1^+Do^tgE#JgogLl3)1%{5uCna_n-znpPq$%^jLJ{;ZW9#*&Hcd!QTvD zEjnEm5Aa5kmPw({-q-(VXH9%mC;#ryc=#8#FC7w6#3OcGswpaw05vv1i>J;9bc#Q9%@UkbPq`M5>y!twERX z>4DJTttv9VGNPK(3>;(5+idq+gWf9dk(t#=x$;f$1qA&A(9<{xlTc*E$6Zxqg88NO zU#suMbK>aR;$D;DU+t21ber5eWJz@&c_Ma!$2^U(2b|4cFIWSB7t`vHYrOXuxcJ6v zS@Mo_ORerp61K)Q&Cx&xu%I`te~B)ckK$qW$#IX!f7v^Cm9N`p+Vo5vs-mJBMlIm}l^1u!QT6 z+5|tRZlK+uyEZH(r-Bh8+{#JT%;JxFa<125Oro;Grc;>7uz$H(=7CbdSrVh#DZ0#4 zBmB?%Rw3uGo*b8=8Y1%7uewe~#liaMvkI>)TPcT0$r-p~le zAyRo~G11c=aSM8kO%{aW8|W3rJ4YYpICkn;Y!zjNkU}SExM+sY{C+)H%RCVFDuxL~0g77iMJzqL*nsZe{{^kpYLg--+wC!%L6hG;dP}DgGij$Q2TYLCZ zf;Owh+m@+&<~A~{jIUO7_2D<7N?px-T*c5qG9~EOINBUNVdaw?yJ` zfynQ4k1;b`%1jqC>q^O;Y~1gXHV->y_m~?8p9{P2=~MUW;x8^1>*;(|O<;SLwfL^D zBbHKCxt;5a)Fo%0DE+?eA32?Oe{fis(+5|s@e74@VoT%2!@@-lwLPqrEO37Wo}>p1>wGl z{+>mln+a$CarepxU$8v;c{Wrm>hdDL3eQrd(2Ri~6%A(@&g?JQCuOuM8;C7k8;}bR z7lQz-$WXT;w6ZXU?D{Qga5V4^E+&w%O09K-pAcASFV|~8|hZ7NC4I8L)4GvJx_0@Cj z%V45cFTOm1;Y@VvN$pMfwg6uD|H0mO$5Y+E|3{KNQYpKLqEvLua3qu>vmzs9MMhR3 z<5WgvRQ8Gprz3>yl@?jqdzGwXmKhy>*ZX~{`@Z{pzMs$c_xSz$`^SCDIq&ytUgLRP z*YgsR#TDX(59__YqHU&pI(~9s3o$_m_obC1(_i?O_2bsBP&fl%!mcgnp}($vj@ili zOV=uIZBj&e%6CeT@td(R{bL`QmDNG=9`OI=2X;c$knnR>Ct-&d8*veA4o=!cbCqwt z)Ey?Z*M^{<6jg6t8k-%H9#xz0X)e&dkbA%+L>7dkQOeaS)n>Bq2H6@~n4d;&tNFb+ru_DQTXYRP;|m?RMQ08H&k$ThP-Y7_mX7 
z?(Mgw1Kf&jkr(|topxP6p`;!3nl<3#p0vSn?sJ8L0)v!ngT5*_&X7h~zgJL~ZP))T zq9E#L7&-VE`?jBcd>}MwvN(Dy>+wIqwKDfhlm$Q)&f;}K*=d)5E5}>0XqvbFoHjVF z11idp`qiz(w-2AQ!_cEphmPtq)nl)fD|0|PN1=#e@`XbIwoKGkG_ofSLH@HrBW6r( zDtGsMbJ`{3^MW2H0dS!$o7FuWmJatcQH}ZQ))QQ}unknerKO)sTI}J+kJRm4`YG&o zBjV+N*vA`_G=ys$>R0*k5)_PVz_R9ZJg)FX_wItipL3x)18~T);BGAW{bp3h48!Q4 z_tL>#|8t(i-#vV8m5b0HJT*PNb{=@{pakr{g391vZ02(C(a*Ei`x9C)D!q}t6CgGq z)WJa4!TmBLiJuXqf$7&hCg-sJf&~Ef{J#zSF;m9ys-3hi0yD73Ds~-??jma=X1V#) z!D#5PQ4QWxi0mG+#kzy4e5q_E8tL2YHa8LteVsS6ZUtY^0&x)a^RGrnAHM7tt>EqT zOWD)>fhNxK9RhTeUvU%)0vJX@&W8{$(&3J=Hs+<-G^TWvO}mT$Ym{@zO*FW?fl3Cl z6VU9ltiJypvLPD?9)&+c`3+ zlnMJDrlLcV+4&fiqqvdcx}lgbx@`gQI2^})O>}voe0S_n(M8)>u#xhgFqQC$etFU3 zKPD3kAF?bsm`9H!cNE$xnZ=)hLk*o->@i4}ESZYWKSPRqv);&A_TIMruaI~00&~uV zkw}PmE9CortMfI)kL>PbLL%O#jA|})(7z+^kh*dr_X&8UwW2>*^Vct`Kl^EL@Wq48 zBNO>D`JWcY%l)?#Ure`G`*K;7u(W-F&RsIn-O&bqFf~l6#IB?d@MM>7FEJRh zXX5^l-oX0L_vR#S|r4kq`dwzKB7V5sqp zg_p6cjv&8e}G6h{hh z7a0q$(PRwKT4_n$l@UGcPvEZ*=skkTK*{z)j|W-}nrVV+R4 zQd_#(H8I6x$Uy3Ny+qGdo$$zw8GB}-^I8nTEY#~*dte0k zurXjE^%QRU0ZF@JKmZg{1i#j;|M<8b<}yD? zg)6e5$GzW-*=O%gTa@3lEm}9Iv*>}1&!9`F%Y`u0L<`n^ zVFpP-!<^w$`@gG)ux`y!iONK9vxQ0j>fwa!$0uXk;-i-~^fCCwdbO+~P1 zzkYUkJAC->H=4V#bALuK1F{mZrkw^JzF5a{tDcg6)_O%gQfi_kk$Q8nU~fHd|P z%U3;MwXtnvWCWj~klpX2UOuo=oHrchJ`O$SJ*# zUs<+A@S?X>dsZHU?m~f;yZRjKn&?Rv>UdjA45eP#Af#~)C5TIE#)GE!o`m}K+3vL5 zf?MJomdj@FWf`uok;-=3ujoyL)9KcWQLYs!EERn;uiL_6 ztXfy)Ri@vvSnN8P-cvi7>vArUAnstRfH`s?@5+XZisi;{Z8W+I1{`tS`tK^SdLeq> zWE$PCHc#ms~kdC_4{dzGRFjkas_ULpIp;fWpwAG12>8mK#F(ii2P zo7m*B;@j0?CXo5l6bdF*obS+&T=BNfZ>AKbCaic1aVt0}5t@Up|JW;;BDO9boRjUU zx>b(2TeHZg-?aCp_qGz_c-kAn3i<&tBmP(2R>srb)oCuYSG8j1yw>ItEZbU34j`>- z^jEL0o=zlYWEGgcp|)piMVh8Trv?8L1LdB%HJcMkq2o-F&4L`TKIH)0NG+tq@r_*0 z@Jdim6ow9(ndjc!DaZk{)gwe5KbkRP`EdcrTmv!-2(h%hU*JOI_~=G;<(4~i?elSaPYt$qH=CJ}Z4$E1wyI@pskFE+Q21KQ!EaQ14SIMi z5~BB5y{&sB;(U)V)v{#baDSDQ*3q0N>|%7OL^M@jOc6ka6BRqlJeDGWb-&izdklv( zAVr8c+%6|wXKBNTxp)qxrV8$qzgdVdihGfIH;*LuT#8tJ0NCZd;P7(0V5!eEhbiKa z&UP2Xoq)_qzil!qVorwUF?K)+;@hmA1D!M6l|w7%2QC+@JnwS_mW3vT;&GX*k;uJ^ z59Y_3l#u>(q9!w5zHytivQl`8<;P(ZyFi1*FmTJXdN320YP4e`JfRAB_*vTHM^H<0 zftq2*V9=BDrMiRRPoXQhep^P#^h-NII1Yy!gm>IXvPrYn$E-ko-e_do%)%lU?(>=w zseB$G-TvqdnPQGz=8G+zT80h}XxE3HLZ)6ogLK6jYCd4%v1expL*o z*XehMy?!izad|boU6yBAX5gp=UPj^gkwfn&Q!KXVmGl%%%tg;ktewUWUUs_t5eDMfb&d+ z`@`tPUi5GQ~a?!($Nx6{ZZ$`1UAz8kYZNH8KdGSjRtY9ex<-3 zOPFEzOz0Y|%o_1?u1xjy;#x3}xh$|WG@EaP)N71DiARmi{qpw+t;zEH{F~F~;=_%u z=9-_X*R0P-pBU_MA6_fD^tv~&AtFDEo>Y)@mQ!5UW>38fy4EZ-LWeBscC1?_J77K^ z9RAGEw(MaPEAqn6`nV-!TY!(QD&H+^k;1Y{p{~kCrqzwtpJf#{TyhZ6Yv-wD+*B?uug{ogzPv32U}FeJQuZTs`PK-Rs%Lq`MG#xYA~y#_>j*$|SZObaRv6!R-IP2qB z0zZplVg!!R5YX&fYCD*^u1S3UbdYy0d!n?T%Db_>%c%lLzQjj{x}yiV-`rO4+1&A= zH9BnMa(^^XJ$A*0FC#4yMXu6N6S^fl4wQXTn@;U$A1V@GzBPONf zKj7lc@V4k%j9i}D$$gzWcW~0kWxwlCn!#p5C7)4G3hC5@h5TAP#%c8yy=b-j`gmY< z?=@O#f=tOg_^5SAW3y_c(ZhU|f&CH70^6Q^b6>NSm>XdAi5EA7>R)=5mD~=?@8W63 zd5L&X$a&=Qw1vFgBtF zu`D*J$m%qiw(5ZfH&c@QQg6=ocnt+fy1UG;66BfIv`Zb-0d}Q{T_a ztsCFzFKc4xHFPM zeAf2mUAcI1BO7CcZAG1d|60}4g&UY#zK&Tbw#ZA8sZpDE73(1P6Id+e!HZjdIU_T{w+{$0&99Z_VwYIK^aMBT0et{zdKU=%B zyRgedj7lMi+fYW47eohNem695kMLCJ8ucv~g#;ho)xpy?x$+Se>zfsp6`v5yXGE3^ zIJ~-NY^`Hg*XTKHq&c}_^WsVFzFzyDin~s^^>laNTYM97S5^}-O&CelbsNoBxHEBDsH98(JO|ab?mG2LKQwpkz*AdZAFr}?lH}Zuv{@_i z7k1}~t9Jv8-D8x7dQW9yP7*zpaJbWf_%`c};fG8xuiDiuzjl2?n%tIo7I|MOOrMY? 
zI#RvMH=rV{6nA77zK2kq6+fMx9qy5FOkDDIR$FhCtNO)~ zFDLe$GYVF;oQmw8{655_d)u~gXI*7&}(Og&mD`My6kOQrKjjNLwo1J*;p<9y8B~S&g zR8a8f78M~JuLT|1S+IQSl%gYts0g1-OyNg{wo&stNk>BEI2R=aZkj>Y<-O$-^$tsh z{Mt1KUROT?DImx1(aOW-UUh}{HXnB~=;p=6`w~nw9r}*1YPn=YdTe~$I~O^3RO(c- z*G+>gEsY1Pi|)gQwbUiu+l_|KD)1OihBWNYjm>!3oXh-+3m~mldh8pBJ#G`Dx%^H? z>Tv2Srgmy66CTu-mE$(YJKUy%>wU3$9Y5;(`hiqD&~L~!cP8zHxGu08iLH}Tp6E@d z+TgmEOs;Px6kBAIQvAwYrzXxVb-4sN^_Yy$IcB=Fs7roV;RxRDG1Enou_zul$hS_P z8pts}Rhdd$!g#K48Yn2*G_@Qy=ec*sj?ER`qmN5ko#L<9WI+e@bCJRv-*@1w&5miH z7w3Tkg=3T%Gat@6rz*=jV>HMRqo#2O+eI0^Hh5vtmvY-JNuy%G5XwTlZF~}Wx8>tT zMitiG>=`xW?7UyKztXbjN{JH`&1H7%CL(?yirzT$@vD8V!J2Qn(j(*$82_tY+bq~!a3IIIzA`0 z{b|~eiu3B@v5_a7p-kqO-b_&U~x_`{;Bg7E}22wqN5)d6R7KMASr-0jCiMmUOimpIKavZ{TX2;gp$n zO?_7WF|(K`Jma~Q;oZC;Kqfg$tWJ#fPL|XY(k{{ToP%e-$y9T5d1erUX67**7WCzU zwgdMM=(CR8OsTn37SUg=u5RlwzxqUGd}(WZ5FsS(Ov2-)1-*sUq=w>l^&ngBjI0S> z8#hlm8z20@z~rjH4hx$?!jRnd1e*ac8oHa7QX^>Bj-BXGZu4|6yXcUf z#VwbXuQ)KXQryDXjzp$+#y_;~N$u!3!|=A~ng3Lp7YSU7t)s(Il4K>+K--ycZGONF z@vHj7?qT!hquT1V`m0^UTb)}Kanm%qw-x3$dnYo=xAQhEZ2mSVzk1$MipV^$=r(Jk zs_LJ1wZoNxM`U3c@hXW|UJw*7X6Hz}JRlKP8H>4s=dg&Cy>8dO@S0lDv?2b2;AGyB zBTUo+?~XcqJ2~Zg?~EsHkE1XkJSIhzl~X(2?mxEhGLTx#mELc_x2DI_pux^GGkIY? zkA11#2uE0~^*Ykav2BH;^r7}_VL#tV%LVGCuBsJ!n!eY;K^bYaR=O)Pv!cr%D~^;F zjJ9T(dOu@*o*ZKPiu34o49|qy8)F;rINZ#gCb~x}V`t(4j(S2IZ)R}KoiON`+qO!m z*pVG;oO`}#aBXnO{}_omA}{2&ZEi>UVn;95WvuZ@TRGLJ5?K2z*jHEJ>dd}zf=(!I z>oeJ*9C%Xp?7EN9D7}~yS_-0R;R(Rk1yjZJ`CHDRK2JLu2_2laEdp$!`M5%Fui!;q)ay(`N)Dff$neSch=VVR0UD&c^A z1@XB|{P!Kbj+Ua`Ih9#OyYBbDeBbbxA%b4MeQ1T%sbrHSX;UqDAlCvtCK|AMmJhen zm%Bs|);?n`w~wq48B7?Podcah##i+%IlI+AKD?k3?_zSOv;DAs`OTtp8IL+}Wskk; zJv|W)zREYsya|ACt*yF3y@YbD-Vj#0!yU>;OLxW>Wt~|ywEQe;@N#du{z@%NJV}2t zFLTv}`k9d+R%9aR^Q82B&o-O!oK3DRoUYqE7`vRtq(;4*O_Y`{RlIDT8Lu?)>9l$G zH5`j!(rdVj==}!*6r?@hqYhvGwr2A9seCIT1CRmeR2C-bC1Wc1p~toR#$1AGKHa7| z=8a;Ipm|uQ-W(#K+aCK`)9`6#`haDlw`a+%y)x=?)Xp=P+9|?t8#mvhHG675P_9?@ zh}L8n9Hwxf!BKHvnm(Z9$M6ur*O*-x=d`XJeiDyx{pJ9IPy?n^hq;{iZEc<#gA|7q z^N=Eq=ElZGbM6lMC?AzN_rOcXq_PwM6^)fNmN}bp8gdejddxK$X|c0?sRrgwQ6OJQ zn=r~ex$4KvD=O#%#3s_p9Gsjfvi;s8L#Hl9Y?ys-2_AZc;%(}aA0Nx5w|jf?jxA4k z^is|Dd)Py_j!+&scikzDA|bI|gy%*jXH|kYhRd!yg$R8uitZXOTd#|0yd)npUA7LM zFkhMbf^auCso&D8>(loM53PxD56z0rXPpA?OC;Xkh_t(3>K2zAm@7Xxnb%>Q11u9j zG3@SNzu#u3q+Oi%F>xim`2l_jvf~Si_uQ-fN`e;SB8SCw&3`O^E%JBcQS`xLC?B8Q zLpH>Ox6=5TTI9z??Udd2_%3%Y9c#)?i2;i-gF-A4 zn-~CJXWik-uelM|i*0*8AY9Bq!AX{pG|mZNVG<#=>22>1T`0TUeS0(Ww0z=L<7h1A zbK(}f3e{5?%58Ko5}I>E)+5x02w?D`_F#o!d$^0|I{+Uq$gh13iPQ^uoyQ6<4038s zALDpEqN06i`YACZh->j?vBvYvP>&{M_He_ABa`3vo-=}BxPYXH=rHjvD`x+6^{2I~ z8G%^4`tgYl`mhG(*M`{zHR~l!nJM#-srWk1+i*c=(M!{>^_%V@l?_OIQrYF%)-0d0 zec@$vHh2Ttf^vCaB@?^a`Qjd@3-HIdN#fI&sdNL?u}XSnrX2v7kM_N=>B+avHoDKo zkD3aDF|VN1(;NctVxB9`(MR2K*y{N;85J1Zow~#uq~4T#7=enM*XNv{gAja6vFq2y z!;n}q-?z;N53eM|Ls%*OxyZz&X#~kc`gobQB|F$s%`**gjCBTK+tdmd768DXvxt_T zZ&Wh>sEN=Vm33tR;8S)XJ_%F)N+hdvH$t~?L_$Ihlut#%X7Z{$HG1`q%MXkBk#0v? 
zSrEJQmL~ehl}G#aa|CWGSE<1Uaj~!uRI`>GccI#{3!4}dV4C!B-!?u?_3>BD8OLc^ z@PgKbR{N3N#%|kGZAu8}VockuIYsTve((cl*E_8z>I;0>L<8pJ)4WFTH5V5%t8;Sj zwe}XPmUGBOz$I>wKCQ{`7DB5VDru4ZGD^~M|MR%5IMGcsx+MZh8z{5k)9+Pk@3PVy zxQM{J7FTR-8I3}2k;gA!(hGw4jUe;?NTLDS!uHc zFC(@0$TLx@BaPWTATP}zZDnzv2`iS*g2J<+&!xj?O%Do+rxZpgC}~<%e6V>E4ThUg zWD6%iU$_G=*d^ZSYNxiX6+d_wXT8ZvS20nI(%tgw;OREu_=py=8#Y+F0Y{iqI=+z= z0mO)>FG3+(bIEo4ozjj^#pQ%FgY5Ux=<;{}ye+I$@XMgqHq>-)f|GogN1h`D73*TS zh6ar*R#p!HO1xrke?1k{=vT%~09;)71@FLJr8-L3jT}t@gICmqWR;*tkku(zeDDYl zdE8Iw3pwz>_HY_9MED>*{$b_q%|h{A&lcI>Ffjd3&w;em=BB2HW95xE$l~ph=%96C zjPtq3o->IY*PLO@WZxXb&@BEQmWslNS8YAco%k%tlnJCo>shQl!x9jf)<6PXiud|j z4qy-65&|B0PM2WydK#E}~cshzNSv6$(eZ-H6W-7vZ&KR)?+U5=u(lZ}DQSz^O zdNQ_6SVoC_6RWq~^mzoyrT+^@f-Jn8Zt02`PB?%RI}FiMi4ybH2P zy-u^KF$@lC>Q(HxMiT528^-X+c66JlDb6|VAW0TqpXnEK{8GKcJ^r!I7JRDZe*T5G zZIX^pHMDl(?tnZNG)uIyAW~gTbcc+r+J~w<(Zy>`yQmDeH-@YxK$9mgw&*8Bomte7 zO4OjLHhm=5NJaR-Q$fgkEa}*`SDKa5oSp?Ap{}xtim>|yU=?noMZ*ynzT9Ecz}HAi z(^AshVXJ1;UFxlH4vzaw-X6c;OFPkqA z+pbIqk`&1U#=}k5iYeU)EC8Vq5!>~lEF_RuL4M}O^+`U>DJGZASs*eh6`!1R;ZMX7 z86|rM2h0t<@n%XJa5pyHGHn3n=mW4vkFVbr?)&Jsf$}STr-@ak2u4!!l#NZgXALS( zkoA5A`=+_T89BSTH%$2RhTj)wQYP4)&SLDi&r@Maa~i6F>y_3rO2H=9RLVC3u?w4S zRW!b^6w=Jwa#I-{@CKYAXjj9BMC1-yfjl9frD{f3K(_ho9^K!drbkuGvUlr9r|=K$ zH)^xna5t#t8!A_3jHgu+V?(yxA?W))YK zW)30&;bjPdAgBx9@5SQ8PeTwYX&t}4=jyD^$# zYxc#ym56MqaEzK9>E{eqH4aI5ny|1%v+8k3THLE+2~x{BGLekva2WFu}2b`*}y( zkYmGz=eE9?rWQQD@_CCTjnem~=shp#H2O-|aMpC)TPSJt%A+qCkJ>7u`owdgapaiY zyTF=juAeYzmkk=WpNqZk);?jAsabJ<>eQ-DL%DoCGk^b5%KP^!CHJ$)DzeqrCtW&l zZG8%822G6u6~i<#p&)uyA&^!#IWDzlB4SBLsX(%$jJj||#r4d+?YlHRaTgP zHa6#&e@P*cz1yV0ZHUf@MmJaWNu$)Zo^u*wk5-F?T{ZelvH;M$6EjlYo~w$V&+RQv ziK}(97y9b?+D3FWPn($cz~gz`lIs=|r?Y-wr_7>iE3qFHU7!C&bZw>6ojQ3W@P+qe z`5_Z0??VR{j@Es@8ZEHaIDITOk8${F%6pNe^U_b~Kb@6Mo(;LES6{=ez;5icBD2r( zetLcBNRH~RCDH|=oAk_7;roLPECV&2SeI73mMc3bZRAqp=~4Hy)Oox)aIL}-h=D(e zyfYL#fD^XC$E-h8?J_hh;aBs<-^V_ps#|g!Z%Xzr*zsNb<58XP)CmqBQ^~p8H#(EF z8{8J2UW=E_oDOS~bo(ed4)zf8&1#mv4!t=Sm~Cdzt_iNd^6pPT zTx8)Bz^@t9$VFD*e464QpJKF}cA;{nifZ2PEd=8G-*V(e6kg77N|y$=?vIO9ap9u* zOFwKL!}DXxw7T@qUNGajLGiS|k4{JLW1$c>k=P&AxSK4x4B0@Ey42&F7r{gqUi#?% z`}$F~(cFOJ4NH3BB^VV+{vq#CgZHT>Hd35J${}&TacO6CKP&#dQcoyhLSKZ)#Y&FZ}*(wM61)yt7iQp(p?KjhiBm zCyszghY=dRh0Z7#_C`jGP~X-L3lE58&7hRo%+ z>dnvG_=T>#w6f_2yQM`U3Di0q0KVrwjJmP0u_<|Zn5&o?vMKi9269@Jr2h4^f&&v+ zcv1#q|3x5ZyRCXG%(z=6Yz;kEcfj~(6j7E&juq}D?N|j4GKwN;*#xV{nvz@E%Lt2A z?*K%pL-_WS5^|b2b@sRnrRuV(2LDoD*hCJB5CXa?cejJNH9y<*)6}%yJ$Nb6AT+pR za5o&ThyVSgJ$VI<3{p%gyu_SFPFuRGCUOjD!dZFw)9c<1i>~r28A|wJIg`K-VUxQ_ zwvPGCDdXeg0ixhjS66TD@1H(BL!&#yNe5c|mw!zKyZ+bBfc0}BWkU@APXjwYFYj)q zzRcCmLqLLgra3y^S^;$Gr{4jXe~jR2Vs~ffUT$vgyVLaK%Yx+_r_7`#oV@N*M1nc1 z{#Nq;(<(t!403A2fkUhsRJ-}NFCSPNbjt_;_#cq8?G^yea6d4OHNdc8`(@qI{0eyf zst#ke1!aI<{MXOY|LN0ImdyucaVKr^% zP6hQSCmLLK5cS7m;Y&W1O6cp0w}2O){wXyVJ+(Bt0?0YlQssL7R{hy-Cc6K+zWcUD z0l(Y)M8_GRQ5pYlXYC(S_;g}!t_}wW$J1xePImY7#E)|W#Hys2$k7z>Z!gpf91*XT zYzE92f1WW&?ha+-l&FUKezV!Z%9EC^1mwv1Lrk1(SN`M3y}(&tzMSmr>@4tyM=99X zHjKKE+;CaYOamD2U#=*uTGJcVA@q3crrB>)tP5lld~h`)KZFXYO6e>Rw&48=dLi)| zX(T3Q*DDn*ZmIFFR^i_--`^JmBF1PWPa*ODS#ARo%8#V-Jmgmue4iuBp_bUqO=L1| zX_L&kt$TzPM*wkrHa?7OkqaD$neE!BEKWZ7qF5#!m*$wq53qTuz0<~lk~xTGX+8+I zYVB~UKYC7l!JqnhXnpCk$$2ukt*HENg$|!5Po28|+^|UdeF!h_f0f^`1n2jW1@}{R z1SttXmQlDszy=>75}^Z5BNNRTRy=|LGG-T{zMzi|Ts#fN%|T?Z?bS&N_KkW%PlT(Oy z2mu3CQE(eeNl4>$B)zDn+Z)oXYN5kWWZs#Cn_usngxO>k3EFsv@W_1uEUUIVa$X1# zZ4dtKIs2Oez=hE|ySl0=DcwGT!JLG&F!3^x?f5lFw0H<~8G>2y3E%51&46;wZ8c3J zUKqTo58d9-5T*u*B_rV7Km%ifwDW#WvPK%eKleQ8eGDKAs(-IXIPn^BQd-a+=TEG| z9eVIp1JY^s>9w!e&E?sr;<9aCaMs_zrTYg6foQ})L=8JR78>}2@H2iv^K 
zkWn{un&`L+?+#}AukwMFYJT7q5X05pp=C3J46mOey)E~2O*8;5!LVcGlodFGf7wm{ zMuPJ*SRG48z&@$pU>`qxEb2+p6ZA;xfx^uY+na_$#3us5pQ!swgVw!0-`rROYEqf1E*w$O$g>wE#DMRRje7!$_0R zjF-BW1lex8|FZ7F__0iE#uJZDi9Q5AwhvGOZ@xVH>63_d&MA1A=v_7T!63tK|74oC z`#>h8iH@ou?{?4!n5(|NzPY=b=%{n$Co-+K4e*YS(tjE;Pt79X^c!sn7k$N9^paVX zH9!YOu3^*=gqnh2#1U$fm3mN(@Of(|m{KqE@;*G+rSvpA`*6dlJ!`!P;lkZknf8BS9shlo_aT^Pmyh2V94VTo+R8M#r^X!^08|0v zWnp*{-QsDwPKd7uPmGN@6g+teG>XVEl|k}sGB&Q*P3(p#x&JRDH{5(-kii>c8o{<; z-TYszH?T|4dvqh)CLt$dDX^cG1-|d<}irWFTHW? z(*7$}!?1DTJ#H8yPkz=VM-G2KPf)GgI zZRx{*V3Vj(Vg<=E!5kR1_DrMLzd+3zRyZ|TdB9uJ1;jh2qs5e|nR&R%cke0L3JBIC zyfa0Ab&99dJ!~8d<{Ht_4wa6Pe+hI^$?$x#8Lg>@n9*^6U|~0v)m-85cJFyu4P>6B z3QnU9^&)+3oca@w0shNzUm%tiPr}An!SkDb;L%h0070p~1alui-KYS;#C2-Xzi`rE z*PO5pLBNoGXmyo3t@p~1RcG)ynH|r!)9czkAeiJCzt9Lcr40oV$H4OS!zR(7MvgEX z4;|^}&UECnKtQ52;o=i!R0MslcL`RBgim<8%8M9DO-AZx#-?+Te)14$(e#AD5Mv!x z&|Ncw=Luepv)Y3`kVF&6Eqj)liEA(L3A(1X`~;^@Twqkg=rY=IJSTcvKupe68hTZv zWMtGc8`%PPoB?h5?*H^uPntiZe>^f$PW>FTu)=m8F%;w?j#Qam@^a*tu2iPJ% z(}ckW8wl=rhgt;Mh@6di@{}JPfjQlm{@TT?L$}}~q>q|fJ`` z=G{`9w2lzY)e*s_&{1xKojdEJvJKrvU7AiLCX#YVYMct)$(^v4pRx=0>D@?oJOyJE zS^W4d#R3`SFg#5Az0 zZ(deUI3cVs01+jvW~@o$LZcCK!8JO^ti@jtEkK9;e` z$uK`Z>MPdPj}{kQE!~g*azFFP-+x4;`L&tk_w9nX7ZTY3)&!jmBd+Z^Ha8bmT6p8OlSFdE8NLo*bixgFfv*b3j4tcA#wmt)Qyr4 z{yg~_iMomIbM=z7Krr)d#Ti;eRP%UDKFnK(Mp7b$X0IT7LWCE(ZsgVwL$fbejZDd| zEe{PtmVQmo$6z8N8ahIAcllr$L1@?n)Tf{$j|2b5#ahaW0X;N~n<7Mwgn@YB0l2)i zUp|D|+!+2GDZCBe00tP4H-?{c18JB8wcch^6;Kjn4vdKUtMeZqlrCbj{F*&yu>#5n znjzo&x#!Apqs+q3_ex2Vcz|cDUF4*Z7?LLJnO~H(rR@RErQ|s+xBgU9a!S zs?{8^+qju23=C7*>yjVS|8ijSK(}VZW6m8zZH-et*tU$Xi5PTM^APYX!)T`HkqF$E z%s{FsklZ=2K(&I&l<1PEf!(Yl_Q078?_7nWo$~?DavNsEfREb(h&xCJ15V3K^ym*h z{(Y0$%BY)vB6OEvje}?-S|Z>^8Ej|wWyQYQ12d0Qxt&j*xi&0HM7JS|1HqrbV;+9Q zZ2K$-USbSY|j_~eKS=X0bsF;m`>8l2D>;c;HAXhPKvY7S33c#wJxOw;a>BN|7)8r-^zC>8; zQYQ2O67|6LBZnrnvKZz$(ZNy(!O-J(_{B)vLcB0TuyqKCNLE*G@I`e$@TQB8Hk z@tPU{JK*WhxS_Rcq?5ET032hl8uwGNj2Xa}ZTQv5G72$NC;(1{ms2g7ZucsT4t zD6D^sN@zKsrX@VxMPkqy^(bGlgUA2&E}J=!qO5uGppW2tfMk=r3~>-!QB8~qyp|LW zvICKW8|X^N%X#d#DQ&>+nHq}Dkp_>$VIkBj!15ue zwH2XttAM6@y=F5j74ZywbM1W`4|=eEDE&S;6!gelg@HOJ!&f9fUvEjF)h(fYT)W%n zPdfkt8eEbFs8z;d^tZqkE9qSlJtrUo$(lV9FxB>0h-*W_1dVQbz=vn(oVURJjlq9P z%0}O%w-srG00IyZSVBO@Mfe;oqOc8TJ?wfi<78q?>%-(e8a@bymC}%RMh^yWtoqms z9Xx1tBEBwO_8h=+LQqC;_JD|NBNKJB=8Nneoj3l}v@hv<2?=zmp9ASTNy3u9N3?{)}3*2{va6BPxwGR0p=n7PiS|qP`7X z=w68QyIAhGf=D`YCjNm2Go9du!`@!W*P=|-4A(TK{b7m;`Y|KgU(VGSL8=Osd|)@K zz)tMA#w~}z5t275<-$tYz`tY$?L{^G0}c4aIas8OeDV>SO04RDS~^7S#e<1Fa+Pr%C_Xl^At`0+mXafIDYI)v7;RyEfEY@Pt~X3#6ugxZ0TDR1bFP3M>A zwyEo(4DuSrecZ~B(Z$Zo`l}v`ll!39umMH^IVnO=xT@N}hL}4(6dS)S=JJz329NVzb`QW> zBYIW#r8DYjC(^?}M17>t`XiV;Nz&o_w_!2U*Hk;@+?}mgPgeB;0^{Ow(+c88VW5W( z;NI*)#j7sp)>QW!kgQK$6{)}Q@4bj8(CbIGxW|!@$@Nv>0VM3y*C#F@VZpl1 zqhj($r>T7a&Mom%iWB*9e1nAmc?@klJ_n~0{@3~%XE0gzUifQIQcH5@nUI@~itf|! 
z;B1N~(okMv45Get+aC11H}3%}7Q)iwV`C@IjgB$?9PfX40erb{=+q4Q*n9N*4PKtG zsEgVK5!+B8=G}I%nw9Djc-=v)kvjZ7*iu;7uPctO=qRqk+V*jC8bFQ+j3Nt$U0NO| z7YF~95>ngc<8vE6%)~^ej@~R3g;9?xLcM{Xe^TR?xC`=fbG#`3gYB?2FQKRdo!#DZ zFzwzjf}wN`Dl#Gc4C0OaQNaLhr(Wh7-$dzxm=ck1G=NtYmLu@@LE6SfULYcpC1&T> zJsu3nWpYQ4z{07)O^j&%1m*yr4>ktI-g2~fo)HzDVSgkKIzCI{QsR?N_5GR>xPj&349 z6?rrP!pR15ryA~Tpu`{;;$f*s^h!IZ{8q1@PM28y^u*vS{%?j1FaIN!wQp)qpCfEV z+X8EnCp<+L)_5=DW^^3da1COhZjrv@|khpEt-sVP;ZC#~c95i#cno!2=dGMqukZFfrsnA}+nZvmG@t{*nLp z^G{ECE*wuZ2uapRU_F7LeQMN??9lrY_E$3FffD&`%v8X^uZQOAb#jx-qFwkNSJ4@M9iNwjqMrhXahcbszX7Hb?__-N;v@n;G zJN9%3Y@pl(XN*4Q1si(Am=MW#@xl1+pPMWt0tV*)+XJsA%0S&TLTtc2A0>(o9hvkr zXvbpl?JvP9sDYb%fGzMdx+l+Ae@Bg;rkaqA2IOp>xCP6xMdhvp*ddBA+gtl5$y475 zzsT$`D~74SW^4X|0lZ+4WeouFhOskkfV{}x`J>2r2bnDPD&-#rIvOO@ zMAjTh13+pOz=!nre@`|c_rhJ{7X?58#^wOFciZ@^lcarhWF|@n{%MB;8 z{ZpcBr#YqG=FPr>V24P-tQ|DM$mT%8Z^44`YzJ!6T zPJi|RI!42&A0bJwPn(i5&>^<{fnR6Gq>UKT+<|M#sIIICQ&fjLx{;c}A>+zRG#y|A zu5%Ocz$iEEBp|(~PAACA&A0>l5DNY5kJzjyhsX=R@nft7-RHIZhPMWC(B~FsUc6!Y z25FrZQ|ptK3L-#Ia=BgJGimC4nY&B-8l;xif`OY0b^_8wy&PCmMy5LCr=UPUe>L*J z1;K>1`84FOpv3i{PNwiV>0rf=ocWmn1)Bu<@6bctc-DR8%?-O%2=AckU$zQ*r|$=R zv7vh){oFw}AQwce5G13#;pkjlDG5kSPCTw)ta749dWbGgUy3hT_nh%aC3LWZ@6xW{?;(D@V)4xC&R*x#6#!jsmy}i! zQ@tvu0MQABvWQLB-9lWBIvYA7gjnkHW$?K;mr|z!HVvst4}0pRi3wE`)uJBA%BAmr zVq}uABEx8ReZaF|44&K4QC7Zoy9Y=z_*eSt`sP=^Y&&Q8u;c=h^cNO4i0>@M!K%LPWsTtBs0e62HXa@t`ilOD>EFNl$nS;65H|;(3hI+ zjaZi|?(6$Ycz(&^`>z1{I+e(=<1}U420ynGi7N-0DcijDS@3JM$9P%c=WXnQ$@Zhi z5)nz9c}I%u*q{#WKYsiG&pbTxm;Z}O|G)iU`K?V5v4X1OfBc}-Gv0_^*MI$i{kzfr zk1vrQM5F7*__sHq(&pEXT_6{fj%ktOJpcM)kWQFiAN}*icR+SC(kdVRccX=W`|Z{L zV-d*X`2Umr+fx6vvp;kYAj9i>tw{|+Q1Zb{%uN*H=HU<=KL=s{gGd>N_Ya^g4M!T6 ze*>nd@2k*;P>ag|lP0cHkGWF$XCO>GKlOlP}(U3E&TY@DZ9T zBOEBl^u7S$91>?&L6R5%X?>2=VmZE|*jx?i9Zxed)RBU%_3sJm`@C0L5=s}#W?$1M zP33q4wm!8m4ASds!&kVeZdFJ*lunvPICxmwO6@x1t#8yGD5$nA=o^W)F7sUMs+wp4 znY$nQ@;=~lAA7PkzV>5O!DH$sbmJ}cY;W{xi-!y<$L5nlAE1r&U`GkSuEWumk3SGD zGj|sQV&_=1FYQJNFVuxW8BRC|P}w~bW0e=>vJQEp%J}EG#`b}|+D?2vUGH3{*+So4#>4SkLNPwXiG!C&}YDs37xYLQ?Jb zN%lJ2M#xw6VL^XfYr@V4N%mKFC#)g>yCc*9*FyPWq(`;K8syO`g3v7}y{xeQ{bSi7 zi5YA)Hw3Qo;~ZYoBKg=!7;pxe5vnZG5YRpp#Q~Sq+T0~(s8IO=o43Gh^9pHP{odPn z3@1Zewq7Oy(O{P5!&|GZ%dol!ZUn~v{nBYBfc>x_Jv|SdvA_-Ud6k}3PTUlA=IGbs zxa6)9NRghYKhdV2&S5~@-wKp@mh565j~polB{dA1<#hkyzU-g6gmhWPM#oawdfD=g zc3FGq8Toxs^fOm8ezhGk89^P+;+U%;Z@+A3dXepo*~-mj(NQw#b3i)sEi2hY1K!u< zpl$h)I^>FGZ+Ew_$5dXVNqO()f#vs-2+-h#iN8zesAH7J3|_Uenyne1>^*q%6TNng ziRY1v@<4q?a*AZbTc;q1D1;e)tfcV#<@o%R7B0dU<%6qG#|`8U1@4q+(Q$>D8PfOF zf%R$b!{R5!1*_|B!cAEWIRF|{bAp$-Vx=w1y-NK?)b;(>p@xCv6y12l31Iv0%4=XC zSkzTblZGr*Qm@zDJS}(mn1T1YGP_aOVcxc-2{{Nf?qoAwsZviQ@~*a~$q!FSeo8Mm zv+4|6w`!jE(Ly5Kq`Y~vyYW=W<1-a&z^<`$rVYwm4vXI;*--aPv#*S9v`u@g)MI2V z3x#22jusIwOzNoKdU4KkLEh9{$oOrrlv?fItdj=T&AD5IK*hve@2$I9FZ-V9@A4CG zPhrF#jQV__1}G_gDfK+0cD${J4n@<86%lNBgn0?b93T~bKtXl35)wXsh*{Wn)In_- zY1CwWrQ^0TsmdZqvm%f>%9S3{et}Hn{eI=a$d<-z%@QD2{h*e&DNjlpDn1y9s_sgB)_uFJg!EBf#=Ac1} zUrLGy26kR~RYE`N25)5nLK7JsT}xl${@Mi6q@{kY0@A&=w0oF};JV$?$;qkTZ8G@c z>~$dfu&@B`k4>H&d@(M%w%Mgw7W;5@Onm888#Zvao; zb?z_M@-XM~aszL((-&J`aE_b0fe`iYbkfNR%wGllPJea?yy(9v$p@y4dUK%yKeI#> zaY(|X^k^JgZ}`&rj&?&N$T8U^J`qLx=?k35@&aRge_na#B9z+Sut_0)J$10`6vAe?_lRfAwQ&3z3n4&|2N?*CceF)kq|a9uhZ}WA_lxa+zqq+s}~RsFG-K z!WVq>zpM)uKtwQsTg*}8hp5K410Um}UA8kxMyIv+xEf?yee#W^OL~&1Z?EIpX}WnJ z4Y=ee2C5(Ozq`q0Tvt^`D(Q(0!QU?`3iA#gysbbQKhvCLqF^7{V20tXk3>9(eug%v zCfju1^}Ey#{9{lpIbccGBE*Wz3s6NYk2FUmjkdYJxHhEH-*(aYfkd1T-^Wfu)M+Vmm( zD*+AEXFa%=j)@zR*uMU{H{?m4@iAxqY3uP_;4AdAo?eMv`2mzXvmIlIuFDmUG*D`z z%JV8~XlMxRSM2}s@y&&jmO4V6Oo(Ta%@gaLq+G}0nm}4((kM_cJUy_Tm~N!hR@XgW 
zJHZ$|`MtjCD~Y7V6J<^-RxJ8_QS!Pth57sacLz;!8)aVRYjk3C^KQ;kn}7n(ORGM* z(5kbRhj0;ldpj`ddFT*Mh6=iwpM$!_jAzzHb&uQ4Hy|W?zFOCbRjU#Mt zi&gA;G4-wX;75XyuqGzgh~&<(e{FEH!fZkU)cCnco)P06UkaIlD`#BI&OnLTfjLEE z2JS>>87>PW-Srz!8r#m& zaN@gTwa25loEsG%hssq!ONSwo+PYVHCE|*zoOog-6#e#z%+0=GUR{9tAiIAVy8JhL zua!$Epo7J3s`I9QTNf2pD_U~;boI!So2xwoiwZGWRfum`18vWa+Y6_}dYv{_u+Chl z3)GrP{;~4awQoLoq;g8K;(Jik&WwIh(%z!SGnaW+qplC_>AOOy`Mo)^{EKJY02NNm zvVp6jxVrGw|!)Ewy$#cs^Z8`rtW64Wv4==#wAkz5cv4d+*g)b`39R5s>CLNz zC@LC+kPKtf2H9tfu^n11dm@an8}nYbY`@mu@5gt1>)#y59QX6gGtYfr_jR4;d7XEg zzEXcO-=hn9brzcmN;rz@Z=n4=s_3d`vu8ZiZYKVyk$WW*c!jKb#~BB)BxjsJ*x(<@ zGr=+_-6;AvM|dox8-VE?yB2JEZw~L|U)M8mm~!UG@YDUdGR%EPv^_^NA2)= zu|DybJuksI&jS*YiDFiE-kBwuo*fTrb zxjf&F)ATWCm#=LOCd}KJ%|B`{5&^bH)@?<3N6;1i(-TSF&jP@`G{1JdeRi;&Ja8j7 zH$ns(C2nv-pAo=eQ)GC_DOPWInl;%!7FLHyfYBl=5q!HeT)FSffl!Mj=Q5Ocd0<}% z@!J7(J2=Tj)mxtNEDjb*iiNquRMeVl#QwHud<=}PMklWc>ubQ2JKBK?AqbL)Vd+k- z5PRtllHvSlLmRIThB!<#zjd6)ieG`<2v}UW%7Lso4wEZLj9+)5i9gK;n)u&;ye4IP z-cNoJr<@zT0}2bWu6EYfoGi#q$Ooq7UZ-yx?ENrVF~6=H`vrmM?v?XM!qSs|{Uj{| z>HKz4WAaxa&o@u}&*&AK<-$Js&%d@JQC(PE{|9FafYJzlm~sL)1)8o>Hwq>jQ47Sx zmdD>}pIBpbvM4xpLC`RRlQus7{fD2>F z!sg(<)ihvkbC{clB-!CgwVEkgZ~qF|YF+suC? z9CuF(0+l23dY$$`60pW0?|1)NV;TBBU5a~E9r_bW4m~kj`V%tN&4BUs9Wa2oNXJo# z*&w&o9Cs->AB0ZO@Y>gR$Vfs4wZ8yzS^-tUkEKs`Z39J))x&y4ntje1%V06xebAkU zxI)6I-Z|C}YsiDhdIgdZ$a#A7gCx9L4?*W(IGxKz+zUAB7Z*qI=tw-&eh}{x+~<(T zMvWKXyo;SfeDVO^QE37os}t?yIUFd5M#ou$E%d_e72wRPAgDj_uoddjkt%n8s*XRv zEKb=cAkyCD71XV@E#XXo7SG4K;&V@rkX|=(5Jvg@sJ?A`Q&aEN=yBgVw2h*wT9-33 z>M=c-bUDn%k){=yCcdkt*EFcZNVch~QN zH0u<2;L2MXO8Si*nuOo+DLbG>kF`nZmrE1pGS1SO)^woqUls!4Khm5)K&NehNsUc@ z236Dp%Iv#vjKuZVY9B0xr&|TWch)Cv4AAgwsAg+b;R)?K!_1> z^_L%q#f)oRI>xAGHB*v!o|;NzGJ9@4c&~GQUpqK6QYz`|YMBeHM2*h)WKDp+3uo=N zm=1bw$K`b|Zf%~5{ZJSXP?)bj@kxpksj6nRu7Rp~|Dk&FOql;zLjH^H7*A710TAVm zwB)h(z!?n0>EtLCdq&x-XTp^y9E0bKr-BVBrQks5jDLM)?ROPsvASBL;1cZHc~=o< zk!~_PGxaJrS8FucjxB$rsgtH}n2&MmC3UR;3Y09|1~6beCcF+8hpcUS z1}yRrZqLN0M-5xR5{N4kchCPQ5%o(zL4d$$sE8J%p=^g^9lTN#fqGXjfqnCTpbu=S zlhE*_TGkI4X6?M%d}_DRBwS=L?o)`k+ilM55T}|PqL(xnFxh#J;vj2s7tC<#BuNu4 z&`D(i`ePFzMPL;ad}6@}r5rSIY|UO*hKQ|C@w9_ND>&|-ylgSl3u4gKY59za0~q0- zDxmcDSN)jEI?joLGa`?lf43QRm)NuuTX+!SiSfl)8PQ4XwjqC~P}K77EcY<8pJ~?k z2abfa=wM7PY)t&wWO^yE2&MCMeM({s7z&@mdo?7qd4(iZUsvbgShKnyv>i7usoupd zGt&Qah`KthaF%SxxHP8QAF2NVmqgST&hL`Q)Tz9~J^Q;U2-MYd2l204Dgn~$wcF$5 z+2y9~OM8jySGFOmJKW_hJKYq=nduhIJZ0b(aWghrs5$S}6j*vf#dsm|$?Bb53A47U zXG13^3Ij}>Zuj9E?o~y0&|3o2*9iW_|J_6U z)5ig=m0V2mk!NOp+o55y#6H^(&wj-ppooQVvIho{bueK2zJ+JCjRimyrRN*^9S zM(oV4Q7)J`>Gq`(T%#S@p`S>J@dZRvlhf+hs_3jCbMzh%q^`qk{rq*@_D`ulqFwxwL&5ZB{s(W3w0Y*b#a z$uZv#_~Z&qPxMb<5kKJG+1 zXxk!xDY>x-(P!CGg3o~@Z&N4|BX!yFE;^5K1^u7X{{dcF_*sr@@^e+O)&dBT<=zFO5~|$sksWG2a{XVn zT>I(IR8bPWK6_Q@S#oSY97H|0sfI>2rJi(qE+bnT0mm;+PA9XGwGnFG9LmDq;Bgmd z?9lKMuu>bXZy^BUT9E^%$aI`=*Q4@&zpJPv$G(_H^&@G$=+3?1%y+!DRPnI1?77W0 z*0k15CY3w+Y}*h&(%jsfUkdwM+R=S+ky7{evI#R@R8=ilG?daAP^V;XP$^8`=*Q`H zoANstmVsv9H~TusV|oPs4aHT>MGEzyDQ`Knb`rNdn5`{@PM@Jj#&B=pY=2>XDLA6% z{p4Ax%J>8gu|AmeIu)~E0|7AtRoukJfiYiU1@0R1{Q!{RVs+(b}BI_X|k*! 
zGJ78-`d0}+=;ATCYMDDan_beFq%&N}gUvu3yuB1-r|H%zhYC8nK1C~>5x74G3Fp+` z){CEL1KuGX6F>$khT;G&GRS@`KfO`s#}Q=#IVK;DNR?}I%kBI@`Fca`QK|l?n6SG3 z@qs3)FQc)sr|r^#ic74#_6Xo}4YHJ+NsrPPCgqAw(#e*oBRnpY^Yw5NUMG#+OfTrV zC@7|GJXT^!e4|V$h^HKPfN>ZyFkZPyg_P12Ifg!bXYHw*^C-~bhz1) zBCTsIz9tVuAO}-R4<*(tWF=Vs?`$u2ZWdKNc?F~P!iwdk`8Xu%f-j&C?^<==>nlw$ zMG7abBKCati!vaJ9#tCji5gF-NCM6g@{5yhS$}13hx6h_5k^yK)cj6SN4jh4VD7cd zSeK&dH&U}IE_PaGl~pNkTXjhl@?8#-C*81}5!CTsLg-oi8D3QkcQ`trRbd297bt@s z=bM{Z`1~kGkyDUmjF+Dln$8IGOF4Z}?;Pw77556pp4@TV1#5;TvZmTp#yGV=*=a8- zm%WB=@*bC1qVK?-$F=39QUb7fpCB@4aZqnrpKfvU@~@QT4pv| zd=^&o%=$}X>=^m`RK_NSSNFr(ru(a!#{vDdonDu^)dk=GgSs{xuAm0uSblKAIn?kfuz~e+&q=$8XWEP zrGWhSS1i4^UPHUxcSz4&uLkOD%Bj`Nt=-zuUI1S(8x#lI^1R=V5NlfpZNX`IR#e44 zFz1JUNb$_m8SR`|Z}+YN&~bF9wvoXd>c;GL%Yu&mKQ*55FW6w?eE~x3$=5a?2xh*_ zzTaLrl5JCWIHZF|BJm~kS-n?CbBSjl3{3kO5dV&h^K;q7*HRqxab$I|tgpsubFUys zds8DU3ktysWiMKt|2f=e9=KzUivV5E9aQjxS=F4`%sfuHY3*^q2TtDAE$jaPouicK zo1GH&3x49#9RQD$c!ROR^LZK?lw`!Mqo$EFgjd4d*CsfBc8i_AJ{ONEr12o+SLaza zAEtznOS&VHiAd11ae})#xWGNesKGyW5$j$iI9PU{MSLIziIe=VpO*gHLT}BhpFx;a zZd|*9I9ePmbPm*(w8<^;K?^HyC9u$O6P{3EtAWnn-5I1#7eVUOeeF8mN+e_9T7gSy z>!n9t_Ng1>1P`Qs7MoLKEtIAUgSnO~E&lHI|2+6VZ+yjU{r@@Di#RzBrRrQ8Bz vj}d5SkuBD|&F&P$|7Dk&zvlnK&7_ZTJzKx6I8_QBjBCH4>E7qN?Jxfq;c;}s literal 0 HcmV?d00001 diff --git a/docs/source/dev/kernel/paged_attention.rst b/docs/source/dev/kernel/paged_attention.rst new file mode 100644 index 0000000000000..6fcadeeec27b6 --- /dev/null +++ b/docs/source/dev/kernel/paged_attention.rst @@ -0,0 +1,525 @@ +vLLM Paged Attention +==================== + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (``csrc/attention/attention_kernels.cu``). + This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block. So in a later document, I will refer to vLLM paged + attention block as "block", while refer to GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and feel easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + multiplication implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. + +Inputs +------ + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers ``q``, ``k_cache``, and ``v_cache``, which point + to query, key, and value data on global memory that need to be read + and processed. The output pointer ``out`` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + .. 

  .. code:: cpp

     template<
       typename scalar_t,
       int HEAD_SIZE,
       int BLOCK_SIZE,
       int NUM_THREADS,
       int PARTITION_SIZE = 0>
     __device__ void paged_attention_kernel(
       ... // Other side args.
       const scalar_t* __restrict__ out,     // [num_seqs, num_heads, max_num_partitions, head_size]
       const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
       const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
       const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
       ... // Other side args.
     )

- There is also a list of template arguments above the function
  signature that are determined at compilation time. ``scalar_t``
  represents the data type of the query, key, and value data elements,
  such as FP16. ``HEAD_SIZE`` indicates the number of elements in each
  head. ``BLOCK_SIZE`` refers to the number of tokens in each block.
  ``NUM_THREADS`` denotes the number of threads in each thread block.
  ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (for
  simplicity, we assume this is 0 and tensor parallel is disabled).
- With these arguments, we need to perform a sequence of preparations.
  This includes calculating the current head index, block index, and
  other necessary variables. However, for now, we can ignore these
  preparations and proceed directly to the actual calculations. It will
  be easier to understand them once we grasp the entire flow.

Concepts
--------

- Just before we dive into the calculation flow, I want to describe a
  few concepts that are needed for later sections. However, you may
  skip this section and return later if you encounter any confusing
  terminology.
- **Sequence**: A sequence represents a client request. For example,
  the data pointed to by ``q`` has a shape of
  ``[num_seqs, num_heads, head_size]``. That means ``q`` points to the
  query data of a total of ``num_seqs`` sequences. Since this kernel is
  a single query attention kernel, each sequence only has one query
  token. Hence, ``num_seqs`` equals the total number of tokens that are
  processed in the batch.
- **Context**: The context consists of the generated tokens from the
  sequence. For instance, ``["What", "is", "your"]`` are the context
  tokens, and the input query token is ``"name"``. The model might
  generate the token ``"?"``.
- **Vec**: The vec is a list of elements that are fetched and
  calculated together. For query and key data, the vec size
  (``VEC_SIZE``) is determined so that each thread group can fetch and
  calculate 16 bytes of data at a time. For value data, the vec size
  (``V_VEC_SIZE``) is determined so that each thread can fetch and
  calculate 16 bytes of data at a time. For example, if the
  ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the
  ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8 (see the
  sketch at the end of this list).
- **Thread group**: The thread group is a small group of
  threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one
  query token and one key token at a time. Each thread handles only a
  portion of the token data. The total number of elements processed by
  one thread group is referred to as ``x``. For example, if the thread
  group contains 2 threads and the head size is 8, then thread 0
  handles the query and key elements at index 0, 2, 4, 6, while thread
  1 handles the elements at index 1, 3, 5, 7.
- **Block**: The key and value cache data in vLLM are split into
  blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``)
  of tokens at one head. Each block may contain only a portion of the
  whole context tokens. For example, if the block size is 16 and the
  head size is 128, then for one head, one block can store 16 \* 128 =
  2048 elements.
- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that
  execute simultaneously on a stream multiprocessor (SM). In this
  kernel, each warp processes the calculation between one query token
  and key tokens of one entire block at a time (it may process multiple
  blocks in multiple iterations). For example, if there are 4 warps and
  6 blocks for one context, the assignment would be: warp 0 handles the
  0th and 4th blocks, warp 1 handles the 1st and 5th blocks, warp 2
  handles the 2nd block, and warp 3 handles the 3rd block.
- **Thread block**: A thread block is a group of
  threads(\ ``NUM_THREADS``) that can access the same shared memory.
  Each thread block contains multiple warps(\ ``NUM_WARPS``), and in
  this kernel, each thread block processes the calculation between one
  query token and key tokens of a whole context.
- **Grid**: A grid is a collection of thread blocks and defines the
  shape of the collection. In this kernel, the shape is
  ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread
  block only handles the calculation for one head, one sequence, and
  one partition.
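
- To make the "Vec" and "Thread group" sizes concrete, here is a
  minimal sketch of how these constants could be derived from the
  template arguments, assuming the template arguments from the Inputs
  section are in scope. The exact expressions in
  ``csrc/attention/attention_kernels.cu`` may differ; this only
  illustrates the "16 bytes per fetch" rule described above.

  .. code:: cpp

     // Sketch only (assumed formulas, not copied from the kernel source).
     constexpr int WARP_SIZE = 32;
     // A warp covers one block of key tokens, so each key token is shared
     // by WARP_SIZE / BLOCK_SIZE cooperating threads (one thread group).
     constexpr int THREAD_GROUP_SIZE = WARP_SIZE / BLOCK_SIZE;               // e.g. 32 / 16 = 2
     // Each thread group fetches 16 bytes of query/key data per access.
     constexpr int VEC_SIZE = 16 / (THREAD_GROUP_SIZE * sizeof(scalar_t));   // FP16: 16 / (2 * 2) = 4
     // Number of vecs one thread owns for one token at one head.
     constexpr int NUM_VECS_PER_THREAD = HEAD_SIZE / (THREAD_GROUP_SIZE * VEC_SIZE); // 128 / 8 = 16
     // Each single thread fetches 16 bytes of value data per access.
     constexpr int V_VEC_SIZE = 16 / sizeof(scalar_t);                       // FP16: 16 / 2 = 8

  With FP16 and ``THREAD_GROUP_SIZE`` of 2, these expressions reproduce
  the ``VEC_SIZE`` of 4 and ``V_VEC_SIZE`` of 8 from the "Vec" bullet.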

Query
-----

- This section will introduce how query data is stored in memory and
  fetched by each thread. As mentioned above, each thread group fetches
  the data of one query token, while each thread itself only handles a
  part of that query token's data. Within each warp, every thread group
  will fetch the same query token data, but will multiply it with
  different key token data.

  .. code:: cpp

     const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;

  .. figure:: ../../assets/kernel/query.png
     :alt: query
     :width: 70%
     :align: center

     Query data of one token at one head

- Each thread defines its own ``q_ptr``, which points to the assigned
  query token data in global memory. For example, if ``VEC_SIZE`` is 4
  and ``HEAD_SIZE`` is 128, ``q_ptr`` points to data that contains a
  total of 128 elements divided into 128 / 4 = 32 vecs.

  .. figure:: ../../assets/kernel/q_vecs.png
     :alt: q_vecs
     :width: 70%
     :align: center

     ``q_vecs`` for one thread group

  .. code:: cpp

     __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];

- Next, we need to read the global memory data pointed to by ``q_ptr``
  into shared memory as ``q_vecs``. It is important to note that each
  vec is assigned to a different row. For example, if the
  ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs,
  while thread 1 handles the 1st row vecs. By reading the query data in
  this way, neighboring threads like thread 0 and thread 1 read
  neighboring memory, achieving memory coalescing and improving
  performance (see the sketch below).
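
- As a rough sketch (not the exact code from
  ``attention_kernels.cu``), the read described above can be pictured
  as each thread copying every ``THREAD_GROUP_SIZE``-th vec of the
  query token into its own row of ``q_vecs``:

  .. code:: cpp

     // Sketch only: thread "thread_group_offset" of a thread group loads
     // its share of the query token from global memory into shared memory.
     #pragma unroll
     for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
       // Interleave vec indices across the threads of the group so that
       // neighboring threads read neighboring 16-byte chunks (coalescing).
       const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
       q_vecs[thread_group_offset][i] =
           *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
     }

  With ``THREAD_GROUP_SIZE`` of 2, thread 0 (offset 0) loads vecs 0, 2,
  4, ... into row 0 and thread 1 (offset 1) loads vecs 1, 3, 5, ... into
  row 1, which matches the row assignment described above.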

Key
---

- Similar to the "Query" section, this section introduces the memory
  layout and assignment for keys. While each thread group only handles
  one query token per kernel run, it may handle multiple key tokens
  across multiple iterations. Meanwhile, each warp will process
  multiple blocks of key tokens in multiple iterations, ensuring that
  all context tokens are processed by the entire thread group after the
  kernel run. In this context, "handle" refers to performing the dot
  multiplication between query data and key data.

  .. code:: cpp

     const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
                           + kv_head_idx * kv_head_stride
                           + physical_block_offset * x;

- Unlike ``q_ptr``, ``k_ptr`` in each thread will point to a different
  key token at different iterations. As shown above, ``k_ptr`` points
  to key token data based on ``k_cache`` at the assigned block,
  assigned head and assigned token.

  .. figure:: ../../assets/kernel/key.png
     :alt: key
     :width: 70%
     :align: center

     Key data of all context tokens at one head

- The diagram above illustrates the memory layout for key data. It
  assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is
  8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each
  rectangle represents all the elements for one key token at one head,
  which will be processed by one thread group. The left half shows the
  total 16 blocks of key token data for warp 0, while the right half
  represents the remaining key token data for other warps or
  iterations. Inside each rectangle, there are a total of 32 vecs (128
  elements for one token) that will be processed by 2 threads (one
  thread group) separately.

  .. figure:: ../../assets/kernel/k_vecs.png
     :alt: k_vecs
     :width: 70%
     :align: center

     ``k_vecs`` for one thread

  .. code:: cpp

     K_vec k_vecs[NUM_VECS_PER_THREAD]

- Next, we need to read the key token data from ``k_ptr`` and store it
  in register memory as ``k_vecs``. We use register memory for
  ``k_vecs`` because it will only be accessed by one thread once,
  whereas ``q_vecs`` will be accessed by multiple threads multiple
  times. Each ``k_vecs`` will contain multiple vectors for later
  calculation. Each vec will be set at each inner iteration. The
  assignment of vecs allows neighboring threads in a warp to read
  neighboring memory together, which again promotes memory coalescing.
  For instance, thread 0 will read vec 0, while thread 1 will read
  vec 1. In the next inner loop, thread 0 will read vec 2, while thread
  1 will read vec 3, and so on.
- You may still be a little confused about the overall flow. Don't
  worry, please keep reading the next "QK" section. It will illustrate
  the query and key calculation flow in a clearer and higher-level
  manner.

QK
---

- As shown in the pseudo code below, before the entire for loop block,
  we fetch the query data for one token and store it in ``q_vecs``.
  Then, in the outer for loop, we iterate through different ``k_ptrs``
  that point to different tokens and prepare the ``k_vecs`` in the
  inner for loop. Finally, we perform the dot multiplication between
  the ``q_vecs`` and each ``k_vecs``.

  .. code:: cpp

     q_vecs = ...
     for ... {
        k_ptr = ...
        for ... {
           k_vecs[i] = ...
        }
        ...
        float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
     }

- As mentioned before, each thread only fetches part of the query and
  key token data at a time. However, there is a cross thread group
  reduction inside ``Qk_dot<>::dot``, so the ``qk`` returned here is
  not just the partial dot product between parts of the query and key
  tokens, but actually the full result over the entire query and key
  token data (a standalone sketch of such a reduction is shown at the
  end of this section).
- For example, if the value of ``HEAD_SIZE`` is 128 and
  ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain a
  total of 64 elements. However, the returned ``qk`` is actually the
  result of the dot multiplication between 128 query elements and 128
  key elements. If you want to learn more about the details of the dot
  multiplication and reduction, you may refer to the implementation of
  ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
  cover it in this document.
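
- To give a feel for what such a cross thread group reduction does,
  here is a minimal illustration written with raw CUDA warp shuffles.
  It is not vLLM's ``Qk_dot<>::dot`` implementation; it only shows how
  the partial dot products of the threads in one thread group (assumed
  to be a power-of-two subset of one warp) can be combined so that
  every thread ends up holding the full ``qk``.

  .. code:: cpp

     // Illustration only: combine per-thread partial dot products within
     // one thread group of THREAD_GROUP_SIZE threads.
     __device__ float thread_group_dot_reduce(float partial_qk) {
     #pragma unroll
       for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
         // Exchange partial sums with the thread whose lane id differs by
         // `mask`; after the loop every thread holds the full dot product.
         partial_qk += __shfl_xor_sync(0xffffffffu, partial_qk, mask);
       }
       return partial_qk;
     }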
+  If you want to learn more about the details of the dot multiplication
+  and reduction, you may refer to the implementation of
+  ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not
+  cover it in this document.
+
+Softmax
+-------
+
+- Next, we need to calculate the normalized softmax over all ``qk``\ s,
+  as shown in the following formulas, where each :math:`x` represents a
+  ``qk``. To do this, we must obtain the reduced value of
+  ``qk_max``\ (:math:`m(x)`) and the ``exp_sum``\ (:math:`\ell(x)`) of
+  all ``qk``\ s. The reduction should be performed across the entire
+  thread block, encompassing results between the query token and all
+  context key tokens.
+
+  .. math::
+     :nowrap:
+
+     \begin{gather*}
+     m(x):=\max _i \quad x_i \\
+     \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\
+     \quad \ell(x):=\sum_i f(x)_i \\
+     \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
+     \end{gather*}
+
+``qk_max`` and ``logits``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Right after we get the ``qk`` result, we can set the temporary
+  ``logits`` result to ``qk`` (in the end, ``logits`` should store the
+  normalized softmax result). We can also compare and collect the
+  ``qk_max`` over all ``qk``\ s that are calculated by the current
+  thread group.
+
+  .. code:: cpp
+
+     if (thread_group_offset == 0) {
+        const bool mask = token_idx >= context_len;
+        logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+     }
+
+- Please note that ``logits`` here is in shared memory, so each thread
+  group will set the fields for its own assigned context tokens.
+  Overall, the size of ``logits`` should be the number of context
+  tokens.
+
+  .. code:: cpp
+
+     for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+        qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+     }
+
+     if (lane == 0) {
+        red_smem[warp_idx] = qk_max;
+     }
+
+- Then we need to get the reduced ``qk_max`` within each warp. The main
+  idea is to let the threads in a warp communicate with each other and
+  obtain the final max ``qk``.
+
+  .. code:: cpp
+
+     for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+        qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+     }
+     qk_max = VLLM_SHFL_SYNC(qk_max, 0);
+
+- Finally, we can get the reduced ``qk_max`` of the whole thread block
+  by comparing the ``qk_max`` values from all warps in this thread
+  block, and then broadcast the final result to each thread.
+
+``exp_sum``
+~~~~~~~~~~~
+
+- Similar to ``qk_max``, we need to get the reduced sum value from the
+  entire thread block too.
+
+  .. code:: cpp
+
+     for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        float val = __expf(logits[i] - qk_max);
+        logits[i] = val;
+        exp_sum += val;
+     }
+     ...
+     exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum);
+
+- First, sum all the exp values in each thread, and meanwhile convert
+  each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. Please
+  note that ``qk_max`` here is already the max ``qk`` across the whole
+  thread block. Then we can do the reduction for ``exp_sum`` across the
+  whole thread block, just like for ``qk_max``.
+
+  .. code:: cpp
+
+     const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+     for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        logits[i] *= inv_sum;
+     }
+
+- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain
+  the final normalized softmax result as ``logits``. This ``logits``
+  variable will be used for the dot multiplication with the value data
+  in later steps. At this point, it stores the normalized softmax
+  result of ``qk`` for all assigned context tokens.
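+- The same max-then-exponentiate-then-normalize sequence can be written
+  as a short serial reference. The sketch below is plain C++ for
+  illustration only; the kernel instead performs the max and sum as the
+  parallel reductions shown above, and the epsilon mirrors the one used
+  in the snippet.
+
+  .. code:: cpp
+
+     #include <cmath>
+     #include <cstdio>
+     #include <vector>
+
+     // Normalizes `logits` in place, mirroring the qk_max / exp_sum steps.
+     void softmax_inplace(std::vector<float>& logits) {
+       float qk_max = -INFINITY;
+       for (float v : logits) qk_max = std::fmax(qk_max, v);  // block-wide max
+       float exp_sum = 0.f;
+       for (float& v : logits) {                              // exp and sum
+         v = std::exp(v - qk_max);
+         exp_sum += v;
+       }
+       const float inv_sum = 1.f / (exp_sum + 1e-6f);         // same epsilon
+       for (float& v : logits) v *= inv_sum;                  // normalize
+     }
+
+     int main() {
+       std::vector<float> logits = {1.0f, 2.0f, 3.0f, 4.0f};
+       softmax_inplace(logits);
+       for (float v : logits) std::printf("%f ", v);
+       std::printf("\n");
+       return 0;
+     }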
+
+Value
+-----
+
+.. figure:: ../../assets/kernel/value.png
+   :alt: value
+   :width: 70%
+   :align: center
+
+   Value data of all context tokens at one head
+
+.. figure:: ../../assets/kernel/logits_vec.png
+   :alt: logits_vec
+   :width: 50%
+   :align: center
+
+   ``logits_vec`` for one thread
+
+.. figure:: ../../assets/kernel/v_vec.png
+   :alt: v_vec
+   :width: 70%
+   :align: center
+
+   List of ``v_vec`` for one thread
+
+- Now we need to retrieve the value data and perform the dot
+  multiplication with ``logits``. Unlike query and key, there is no
+  thread group concept for value data. As shown in the diagram, unlike
+  the key token memory layout, elements in the same column correspond
+  to the same value token. For one block of value data, there are
+  ``HEAD_SIZE`` rows and ``BLOCK_SIZE`` columns, which are split into
+  multiple ``v_vecs``.
+- Each thread always fetches ``V_VEC_SIZE`` elements from the same
+  ``V_VEC_SIZE`` tokens at a time. As a result, a single thread
+  retrieves multiple ``v_vec``\ s from different rows and the same
+  columns through multiple inner iterations. Each ``v_vec`` needs to be
+  dot multiplied with the corresponding ``logits_vec``, which is also
+  ``V_VEC_SIZE`` elements from ``logits``. Overall, with multiple inner
+  iterations, each warp will process one block of value tokens, and
+  with multiple outer iterations, the value tokens of the whole context
+  are processed.
+
+  .. code:: cpp
+
+     float accs[NUM_ROWS_PER_THREAD];
+     for ... { // Iteration over different blocks.
+         logits_vec = ...
+         for ... { // Iteration over different rows.
+             v_vec = ...
+             ...
+             accs[i] += dot(logits_vec, v_vec);
+         }
+     }
+
+- As shown in the above pseudo code, in the outer loop, similar to
+  ``k_ptr``, ``logits_vec`` iterates over different blocks and reads
+  ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each
+  thread reads ``V_VEC_SIZE`` elements from the same tokens as a
+  ``v_vec`` and performs the dot multiplication. It is important to
+  note that in each inner iteration, the thread fetches elements at
+  different head positions for the same tokens. The dot result is then
+  accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped
+  to a head position assigned to the current thread.
+- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each
+  thread fetches 8 value elements for 8 tokens at a time. Each element
+  comes from a different token at the same head position. If
+  ``HEAD_SIZE`` is 128 and ``WARP_SIZE`` is 32, for each inner loop, a
+  warp needs to fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This
+  means there are a total of 128 \* 16 / 256 = 8 inner iterations for a
+  warp to handle a whole block of value tokens. The ``accs`` array in
+  each thread then contains 8 elements accumulated at 8 different head
+  positions. For thread 0, the ``accs`` variable will have 8 elements,
+  which are the 0th, 16th, …, 112th elements of a value head,
+  accumulated from all 8 assigned tokens.
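+- The following host-side sketch (again, an illustration rather than
+  kernel code; the sizes and values are toy examples) shows the net
+  effect of this step for a single head: each head position of the
+  output is the ``logits``-weighted sum of that position over all value
+  tokens, which is exactly what the per-thread ``accs`` entries
+  accumulate.
+
+  .. code:: cpp
+
+     #include <cstdio>
+     #include <vector>
+
+     int main() {
+       constexpr int HEAD_SIZE = 8;    // small example values
+       constexpr int NUM_TOKENS = 4;
+
+       // value[token][head_position], plus already-normalized logits.
+       std::vector<std::vector<float>> value(NUM_TOKENS,
+                                             std::vector<float>(HEAD_SIZE));
+       std::vector<float> logits = {0.1f, 0.2f, 0.3f, 0.4f};
+       for (int t = 0; t < NUM_TOKENS; ++t)
+         for (int h = 0; h < HEAD_SIZE; ++h)
+           value[t][h] = 0.1f * t + 0.01f * h;
+
+       // accs[h] accumulates the weighted value elements for head position h,
+       // as the kernel does across v_vecs and blocks.
+       std::vector<float> accs(HEAD_SIZE, 0.f);
+       for (int t = 0; t < NUM_TOKENS; ++t)
+         for (int h = 0; h < HEAD_SIZE; ++h)
+           accs[h] += logits[t] * value[t][h];
+
+       for (int h = 0; h < HEAD_SIZE; ++h)
+         std::printf("out[%d] = %f\n", h, accs[h]);
+       return 0;
+     }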
+
+LV
+---
+
+- Now, we need to perform a reduction on ``accs`` within each warp.
+  This process allows each thread to accumulate the ``accs`` for the
+  assigned head positions of all tokens in one block.
+
+  .. code:: cpp
+
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        float acc = accs[i];
+        for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+            acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+        }
+        accs[i] = acc;
+     }
+
+- Next, we perform a reduction on ``accs`` across all warps, allowing
+  each thread to have the accumulation of ``accs`` for the assigned
+  head positions of all context tokens. Please note that each ``accs``
+  in every thread only stores the accumulation for a portion of the
+  elements of the entire head for all context tokens. However, overall,
+  all results for the output have been calculated; they are just stored
+  in different threads' register memory.
+
+  .. code:: cpp
+
+     float* out_smem = reinterpret_cast<float*>(shared_mem);
+     for (int i = NUM_WARPS; i > 1; i /= 2) {
+         // Upper warps write to shared memory.
+         ...
+         float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+         for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+             ...
+             dst[row_idx] = accs[i];
+         }
+
+         // Lower warps update the output.
+         const float* src = &out_smem[warp_idx * HEAD_SIZE];
+         for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+             ...
+             accs[i] += src[row_idx];
+         }
+
+         // Write out the accs.
+     }
+
+Output
+------
+
+- Now we can write all of the calculated results from local register
+  memory to the final output in global memory.
+
+  .. code:: cpp
+
+     scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                         + head_idx * max_num_partitions * HEAD_SIZE
+                         + partition_idx * HEAD_SIZE;
+
+- First, we need to define the ``out_ptr`` variable, which points to
+  the start address of the assigned sequence and assigned head.
+
+  .. code:: cpp
+
+     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+            from_float(*(out_ptr + row_idx), accs[i]);
+        }
+     }
+
+- Finally, we need to iterate over the different assigned head
+  positions and write out the corresponding accumulated results based
+  on ``out_ptr``.
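+- Putting the steps together, a minimal serial reference of the whole
+  single-query attention computation for one head might look like the
+  sketch below. It is an editorial illustration of the math only (the
+  function name and toy inputs are invented for this example) and
+  ignores paging, partitions, multiple heads, and all of the parallel
+  reductions described above.
+
+  .. code:: cpp
+
+     #include <cmath>
+     #include <cstdio>
+     #include <vector>
+
+     // Reference single-query attention for one head (no paging, no partitions).
+     std::vector<float> single_query_attention(
+         const std::vector<float>& q,
+         const std::vector<std::vector<float>>& keys,
+         const std::vector<std::vector<float>>& values,
+         float scale) {
+       const int num_tokens = static_cast<int>(keys.size());
+       const int head_size = static_cast<int>(q.size());
+
+       // QK: scaled dot product between the query and every context key token.
+       std::vector<float> logits(num_tokens, 0.f);
+       for (int t = 0; t < num_tokens; ++t) {
+         for (int h = 0; h < head_size; ++h) logits[t] += q[h] * keys[t][h];
+         logits[t] *= scale;
+       }
+
+       // Softmax using the qk_max / exp_sum steps described above.
+       float qk_max = -INFINITY, exp_sum = 0.f;
+       for (float v : logits) qk_max = std::fmax(qk_max, v);
+       for (float& v : logits) { v = std::exp(v - qk_max); exp_sum += v; }
+       for (float& v : logits) v /= exp_sum;
+
+       // LV + Output: logits-weighted sum of the value tokens.
+       std::vector<float> out(head_size, 0.f);
+       for (int t = 0; t < num_tokens; ++t)
+         for (int h = 0; h < head_size; ++h) out[h] += logits[t] * values[t][h];
+       return out;
+     }
+
+     int main() {
+       std::vector<float> q = {0.1f, 0.2f, 0.3f, 0.4f};
+       std::vector<std::vector<float>> keys = {{1.f, 0.f, 0.f, 0.f},
+                                               {0.f, 1.f, 0.f, 0.f}};
+       std::vector<std::vector<float>> values = {{1.f, 2.f, 3.f, 4.f},
+                                                 {5.f, 6.f, 7.f, 8.f}};
+       auto out = single_query_attention(q, keys, values, 1.0f);
+       for (float v : out) std::printf("%f ", v);
+       std::printf("\n");
+       return 0;
+     }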
diff --git a/docs/source/index.rst b/docs/source/index.rst index e90481845c4ff..c0250bf99f7ae 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -98,6 +98,7 @@ Documentation :caption: Developer Documentation dev/engine/engine_index + dev/kernel/paged_attention Indices and tables ================== From 9cbc7e5f3be72552d6041f81738921a9597643e8 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Tue, 5 Mar 2024 02:37:58 +0800 Subject: [PATCH 051/113] enable --gpu-memory-utilization in benchmark_throughput.py (#3175) Co-authored-by: zixiao --- benchmarks/benchmark_throughput.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1f0bfe06a67cb..72bdc4b3b4540 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -74,6 +74,7 @@ def run_vllm( kv_cache_dtype: str, device: str, enable_prefix_caching: bool, + gpu_memory_utilization: float = 0.9, ) -> float: from vllm import LLM, SamplingParams llm = LLM(model=model, @@ -84,6 +85,7 @@ def run_vllm( trust_remote_code=trust_remote_code, dtype=dtype, max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, @@ -206,13 +208,12 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, - args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device, - args.enable_prefix_caching) + elapsed_time = run_vllm( + requests, args.model, args.tokenizer, args.quantization, + args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, + args.trust_remote_code, args.dtype, args.max_model_len, + args.enforce_eager, args.kv_cache_dtype, args.device, + args.enable_prefix_caching, args.gpu_memory_utilization) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -287,6 +288,12 @@ def main(args: argparse.Namespace): 'The "auto" option will use FP16 precision ' 'for FP32 and FP16 models, and BF16 precision ' 'for BF16 models.') + parser.add_argument('--gpu-memory-utilization', + type=float, + default=0.9, + help='the fraction of GPU memory to be used for ' + 'the model executor, which can range from 0 to 1.' 
+ 'If unspecified, will use the default value of 0.9.') parser.add_argument("--enforce-eager", action="store_true", help="enforce eager execution") From 76e8a70476ef9daa970349c14c117fe91e8b4544 Mon Sep 17 00:00:00 2001 From: ttbachyinsda Date: Tue, 5 Mar 2024 03:17:12 +0800 Subject: [PATCH 052/113] [Minor fix] The domain dns.google may cause a socket.gaierror exception (#3176) Co-authored-by: guofangze --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index a4f9bfe6aac99..9cdf623379516 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -173,7 +173,7 @@ def get_ip() -> str: # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: - s.connect(("dns.google", 80)) # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable return s.getsockname()[0] except OSError: # try ipv6 From 22de45235c6dd14e901e089971635ec655d5fbe0 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 4 Mar 2024 11:54:06 -0800 Subject: [PATCH 053/113] Push logprob generation to LLMEngine (#3065) Co-authored-by: Avnish Narayan --- tests/entrypoints/test_openai_server.py | 61 ++- tests/samplers/test_logprobs.py | 42 +- tests/worker/spec_decode/utils.py | 12 +- vllm/config.py | 2 + vllm/engine/arg_utils.py | 10 +- vllm/engine/async_llm_engine.py | 29 +- vllm/engine/llm_engine.py | 42 +- vllm/entrypoints/openai/serving_chat.py | 236 ++++++----- vllm/entrypoints/openai/serving_completion.py | 391 +++++++++--------- vllm/entrypoints/openai/serving_engine.py | 23 +- vllm/model_executor/layers/sampler.py | 15 +- vllm/sequence.py | 25 +- vllm/worker/spec_decode/multi_step_worker.py | 2 +- 13 files changed, 555 insertions(+), 335 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e426cf7eed72b..f4a6e44d88a87 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -213,14 +213,14 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, messages=messages, max_tokens=10, logprobs=True, - top_logprobs=10) + top_logprobs=5) assert chat_completion.id is not None assert chat_completion.choices is not None and len( chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 10 + assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -229,7 +229,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, # test multi-turn dialogue messages.append({"role": "user", "content": "express your result in json"}) chat_completion = await client.chat.completions.create( - model=MODEL_NAME, + model=model_name, messages=messages, max_tokens=10, ) @@ -237,6 +237,61 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" 
+ }] + + # Default max_logprobs is 5, so this should raise an error + with pytest.raises((openai.BadRequestError, openai.APIError)): + stream = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10, + stream=True) + async for chunk in stream: + ... + + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=10, + stream=False) + + with pytest.raises((openai.BadRequestError, openai.APIError)): + stream = await client.completions.create(model=model_name, + prompt="Test", + max_tokens=10, + logprobs=10, + stream=True) + async for chunk in stream: + ... + + with pytest.raises(openai.BadRequestError): + await client.completions.create(model=model_name, + prompt="Test", + max_tokens=10, + logprobs=10, + stream=False) + + # the server should still work afterwards + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + stream=False) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + @pytest.mark.parametrize( # just test 1 lora hereafter "model_name", diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 0ea3704462fcb..1abb55f021214 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,5 +1,6 @@ import pytest import torch +from tests.conftest import VllmRunner from vllm import SamplingParams @@ -16,6 +17,7 @@ def test_get_prompt_logprobs( example_prompts, ): max_tokens = 5 + num_top_logprobs = 6 hf_model = hf_runner(model, dtype=dtype) hf_logprobs = hf_model.generate_greedy_logprobs( example_prompts, @@ -23,19 +25,32 @@ def test_get_prompt_logprobs( ) del hf_model - vllm_model = vllm_runner(model, dtype=dtype) + vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=5, + logprobs=num_top_logprobs, prompt_logprobs=5, temperature=0.0) vllm_results = vllm_model.model.generate( example_prompts, sampling_params=vllm_sampling_params) - del vllm_model # Test whether logprobs are included in the results. 
for result in vllm_results: assert result.prompt_logprobs is not None assert result.outputs[0].logprobs is not None + assert len(result.outputs[0].logprobs) == max_tokens + for logprobs in result.outputs[0].logprobs: + assert len(logprobs) == num_top_logprobs + output_text = result.outputs[0].text + output_string_from_most_likely_tokens = [] + for top_logprobs in result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens.append( + top_logprob.decoded_token) + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens) + assert output_text == output_string_from_most_likely_tokens, ( + "The output text from the top logprob for each token position " + "should be the same as the output text in the result.") # Test whether prompt logprobs are consistent with HF for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): @@ -43,14 +58,29 @@ def test_get_prompt_logprobs( vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob, + torch.testing.assert_close(logprob.logprob, hf_logprob[0][i][token_id].item(), atol=1e-2, rtol=1e-2) vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): - for token_id, logprob in vllm_sample_logprob_dict.items(): + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + logprob = sample_logprob.logprob torch.testing.assert_close(logprob, hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) + assert isinstance(sample_logprob.decoded_token, str), \ + ("The token should be decoded by the time it is returned " + " to the user.") + + +def test_max_logprobs(): + runner = VllmRunner("facebook/opt-125m", max_logprobs=1) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) diff --git a/tests/worker/spec_decode/utils.py b/tests/worker/spec_decode/utils.py index 8d74509fea488..fa8767cf898aa 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/worker/spec_decode/utils.py @@ -4,7 +4,7 @@ from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import SequenceGroupMetadata, SequenceData +from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed @@ -166,13 +166,15 @@ def create_seq_group_metadata_from_prompts( def assert_logprobs_dict_allclose( - actual_logprobs: List[Dict[int, float]], - expected_logprobs: List[Dict[int, float]]) -> None: + actual_logprobs: List[Dict[int, Logprob]], + expected_logprobs: List[Dict[int, Logprob]]) -> None: for single_step_actual_logprobs, single_step_expected_logprobs in zip( actual_logprobs, expected_logprobs): assert set(single_step_actual_logprobs.keys()) == set( single_step_expected_logprobs.keys()) for token_id in single_step_actual_logprobs: - actual = torch.tensor(single_step_actual_logprobs[token_id]) - expected = 
torch.tensor(single_step_expected_logprobs[token_id]) + actual = torch.tensor( + single_step_actual_logprobs[token_id].logprob) + expected = torch.tensor( + single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) diff --git a/vllm/config.py b/vllm/config.py index e39fd7265689f..ef9a920f29c2a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -79,6 +79,7 @@ def __init__( quantization: Optional[str] = None, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, + max_logprobs: int = 5, ) -> None: self.model = model self.tokenizer = tokenizer @@ -93,6 +94,7 @@ def __init__( self.quantization = quantization self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture + self.max_logprobs = max_logprobs if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6882e8be34d11..c3dccdd5bb50b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -31,6 +31,7 @@ class EngineArgs: max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 max_paddings: int = 256 + max_logprobs: int = 5 # OpenAI default value disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None @@ -212,6 +213,12 @@ def add_cli_args( type=int, default=EngineArgs.max_paddings, help='maximum number of paddings in a batch') + parser.add_argument( + '--max-logprobs', + type=int, + default=EngineArgs.max_logprobs, + help=('max number of log probs to return logprobs is specified in' + ' SamplingParams')) parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics') @@ -300,7 +307,8 @@ def create_engine_configs( self.trust_remote_code, self.download_dir, self.load_format, self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, - self.enforce_eager, self.max_context_len_to_capture) + self.enforce_eager, self.max_context_len_to_capture, + self.max_logprobs) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 9e52d20ca4980..df66139fddcd1 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -47,7 +47,7 @@ def __init__(self, request_id: str) -> None: self._queue = asyncio.Queue() self._finished = False - def put(self, item: RequestOutput) -> None: + def put(self, item: Union[RequestOutput, Exception]) -> None: if self._finished: return self._queue.put_nowait(item) @@ -110,6 +110,17 @@ def process_request_output(self, logger.info(f"Finished request {request_id}.") self.abort_request(request_id) + def process_exception(self, + request_id: str, + exception: Exception, + *, + verbose: bool = False) -> None: + """Propagate an exception from the engine.""" + self._request_streams[request_id].put(exception) + if verbose: + logger.info(f"Finished request {request_id}.") + self.abort_request(request_id) + def add_request(self, request_id: str, **engine_add_request_kwargs) -> AsyncStream: """Add a request to be sent to the engine on the next background @@ -377,10 +388,18 @@ async def engine_step(self) -> bool: for new_request in new_requests: # Add the request into the vLLM engine's waiting queue. 
# TODO: Maybe add add_request_batch to reduce Ray overhead - if self.engine_use_ray: - await self.engine.add_request.remote(**new_request) - else: - await self.engine.add_request_async(**new_request) + try: + if self.engine_use_ray: + await self.engine.add_request.remote(**new_request) + else: + await self.engine.add_request_async(**new_request) + except ValueError as e: + # TODO: use a vLLM specific error for failed validation + self._request_tracker.process_exception( + new_request["request_id"], + e, + verbose=self.log_requests, + ) if finished_requests: await self._engine_abort(finished_requests) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8a2573034c940..703756996b7f7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -18,7 +18,7 @@ from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams -from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, +from vllm.sequence import (Logprob, SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) @@ -473,6 +473,13 @@ def add_request( if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") + max_logprobs = self.get_model_config().max_logprobs + if (sampling_params.logprobs + and sampling_params.logprobs > max_logprobs) or ( + sampling_params.prompt_logprobs + and sampling_params.prompt_logprobs > max_logprobs): + raise ValueError(f"Cannot request more than " + f"{max_logprobs} logprobs.") if arrival_time is None: arrival_time = time.monotonic() prompt_token_ids = self.encode_request( @@ -583,6 +590,13 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, # Process prompt logprobs prompt_logprobs = outputs.prompt_logprobs if prompt_logprobs is not None: + # We can pick any sequence for the prompt. + seq = next(iter(seq_group.seqs_dict.values())) + all_token_ids = seq.get_token_ids() + for i, prompt_logprobs_for_token in enumerate(prompt_logprobs): + self._decode_logprobs(seq, seq_group.sampling_params, + prompt_logprobs_for_token, + all_token_ids[:i]) seq_group.prompt_logprobs = prompt_logprobs # Process samples @@ -930,12 +944,36 @@ def _get_stats(self, time_e2e_requests=time_e2e_requests, ) + def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, + logprobs: Dict[int, Logprob], + all_input_ids: List[int]) -> None: + if not logprobs: + return + for token_id, sample_logprob in logprobs.items(): + if (sample_logprob.decoded_token is None and token_id != -1): + all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] + _, new_text, prefix_offset, read_offset = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. 
+ spaces_between_special_tokens, + ) + sample_logprob.decoded_token = new_text + def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: """Decodes the new token for a sequence.""" + all_input_ids = seq.get_token_ids() + self._decode_logprobs(seq, prms, seq.output_logprobs[-1], + all_input_ids) + (new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally( self.get_tokenizer_for_seq(seq), - all_input_ids=seq.get_token_ids(), + all_input_ids=all_input_ids, prev_tokens=seq.tokens, prefix_offset=seq.prefix_offset, read_offset=seq.read_offset, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f4ad0aa5a0184..ba352f18f6454 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -82,8 +82,12 @@ async def create_chat_completion( return self.chat_completion_stream_generator( request, result_generator, request_id) else: - return await self.chat_completion_full_generator( - request, raw_request, result_generator, request_id) + try: + return await self.chat_completion_full_generator( + request, raw_request, result_generator, request_id) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) def get_chat_request_role(self, request: ChatCompletionRequest) -> str: if request.add_generation_prompt: @@ -99,117 +103,133 @@ async def chat_completion_stream_generator( model_name = request.model created_time = int(time.monotonic()) chunk_object_type = "chat.completion.chunk" - - # Send first response for each request.n (index) with the role - role = self.get_chat_request_role(request) - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(role=role), - logprobs=None, - finish_reason=None) - chunk = ChatCompletionStreamResponse(id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - - # Send response to echo the input portion of the last message - if request.echo: - last_msg_content = "" - if request.messages and isinstance( - request.messages, list) and request.messages[-1].get( - "content") and request.messages[-1].get( - "role") == role: - last_msg_content = request.messages[-1]["content"] - - if last_msg_content: - for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=last_msg_content), - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - logprobs=None, - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" + first_iteration = True # Send response for each token for each request.n (index) previous_texts = [""] * request.n previous_num_tokens = [0] * request.n finish_reason_sent = [False] * request.n - async for res in result_generator: - res: RequestOutput - for output in res.outputs: - i = output.index - - if finish_reason_sent[i]: - continue - - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ - previous_num_tokens[i]:] if output.logprobs else None - - if request.logprobs: - logprobs = self._create_logprobs( - token_ids=delta_token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - initial_text_offset=len(previous_texts[i]), - ) - 
else: - logprobs = None - - delta_text = output.text[len(previous_texts[i]):] - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) - if output.finish_reason is None: - # Send token-by-token response for each request.n - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - logprobs=logprobs, - finish_reason=None) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - data = chunk.model_dump_json(exclude_unset=True) - yield f"data: {data}\n\n" - else: - # Send the finish response for each request.n only once - prompt_tokens = len(res.prompt_token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=previous_num_tokens[i], - total_tokens=prompt_tokens + previous_num_tokens[i], - ) - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage(content=delta_text), - logprobs=logprobs, - finish_reason=output.finish_reason) - chunk = ChatCompletionStreamResponse( - id=request_id, - object=chunk_object_type, - created=created_time, - choices=[choice_data], - model=model_name) - if final_usage is not None: - chunk.usage = final_usage - data = chunk.model_dump_json(exclude_unset=True, - exclude_none=True) - yield f"data: {data}\n\n" - finish_reason_sent[i] = True + try: + async for res in result_generator: + res: RequestOutput + # We need to do it here, because if there are exceptions in + # the result_generator, it needs to be sent as the FIRST + # response (by the try...catch). + if first_iteration: + # Send first response for each request.n (index) with the role + role = self.get_chat_request_role(request) + for i in range(request.n): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(role=role), + logprobs=None, + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + + # Send response to echo the input portion of the last message + if request.echo: + last_msg_content = "" + if request.messages and isinstance( + request.messages, + list) and request.messages[-1].get( + "content") and request.messages[-1].get( + "role") == role: + last_msg_content = request.messages[-1]["content"] + + if last_msg_content: + for i in range(request.n): + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + logprobs=None, + model=model_name) + data = chunk.model_dump_json( + exclude_unset=True) + yield f"data: {data}\n\n" + first_iteration = False + + for output in res.outputs: + i = output.index + + if finish_reason_sent[i]: + continue + + delta_token_ids = output.token_ids[previous_num_tokens[i]:] + top_logprobs = output.logprobs[ + previous_num_tokens[i]:] if output.logprobs else None + + if request.logprobs: + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + + delta_text = output.text[len(previous_texts[i]):] + previous_texts[i] = output.text + previous_num_tokens[i] = 
len(output.token_ids) + if output.finish_reason is None: + # Send token-by-token response for each request.n + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(content=delta_text), + logprobs=logprobs, + finish_reason=None) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + data = chunk.model_dump_json(exclude_unset=True) + yield f"data: {data}\n\n" + else: + # Send the finish response for each request.n only once + prompt_tokens = len(res.prompt_token_ids) + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=previous_num_tokens[i], + total_tokens=prompt_tokens + + previous_num_tokens[i], + ) + choice_data = ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage(content=delta_text), + logprobs=logprobs, + finish_reason=output.finish_reason) + chunk = ChatCompletionStreamResponse( + id=request_id, + object=chunk_object_type, + created=created_time, + choices=[choice_data], + model=model_name) + if final_usage is not None: + chunk.usage = final_usage + data = chunk.model_dump_json(exclude_unset=True, + exclude_none=True) + yield f"data: {data}\n\n" + finish_reason_sent[i] = True + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" # Send the final done message after all response.n are finished yield "data: [DONE]\n\n" diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 99a10196b5f73..a8244fd150753 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -26,107 +26,6 @@ [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] -async def completion_stream_generator( - request: CompletionRequest, - raw_request: Request, - on_abort, - result_generator: AsyncIterator[Tuple[int, RequestOutput]], - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, - num_prompts: int, -) -> AsyncGenerator[str, None]: - previous_texts = [""] * request.n * num_prompts - previous_num_tokens = [0] * request.n * num_prompts - has_echoed = [False] * request.n * num_prompts - - async for prompt_idx, res in result_generator: - - # Abort the request if the client disconnects. - if await raw_request.is_disconnected(): - await on_abort(f"{request_id}-{prompt_idx}") - raise StopAsyncIteration() - - for output in res.outputs: - i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
- - if request.echo and request.max_tokens == 0: - # only return the prompt - delta_text = res.prompt - delta_token_ids = res.prompt_token_ids - top_logprobs = res.prompt_logprobs - has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[i]: - # echo the prompt and first token - delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids - top_logprobs = res.prompt_logprobs + (output.logprobs or []) - has_echoed[i] = True - else: - # return just the delta - delta_text = output.text[len(previous_texts[i]):] - delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ - previous_num_tokens[i]:] if output.logprobs else None - - if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" - logprobs = create_logprobs_fn( - token_ids=delta_token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - initial_text_offset=len(previous_texts[i]), - ) - else: - logprobs = None - - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) - finish_reason = output.finish_reason - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text=delta_text, - logprobs=logprobs, - finish_reason=finish_reason, - ) - ]).model_dump_json() - yield f"data: {response_json}\n\n" - - if output.finish_reason is not None: # return final usage - logprobs = LogProbs() if request.logprobs is not None else None - prompt_tokens = len(res.prompt_token_ids) - completion_tokens = len(output.token_ids) - final_usage = UsageInfo( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - response_json = CompletionStreamResponse( - id=request_id, - created=created_time, - model=model_name, - choices=[ - CompletionResponseStreamChoice( - index=i, - text="", - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - ], - usage=final_usage, - ).model_dump_json() - yield f"data: {response_json}\n\n" - - yield "data: [DONE]\n\n" - - def parse_prompt_format(prompt) -> Tuple[bool, list]: # get the prompt, openai supports the following # "a string, array of strings, array of tokens, or array of token arrays." 
@@ -151,73 +50,6 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: return prompt_is_tokens, prompts -def request_output_to_completion_response( - final_res_batch: List[RequestOutput], - request: CompletionRequest, - create_logprobs_fn: TypeCreateLogProbsFn, - request_id: str, - created_time: int, - model_name: str, -) -> CompletionResponse: - choices = [] - num_prompt_tokens = 0 - num_generated_tokens = 0 - for final_res in final_res_batch: - assert final_res is not None - prompt_token_ids = final_res.prompt_token_ids - prompt_logprobs = final_res.prompt_logprobs - prompt_text = final_res.prompt - - for output in final_res.outputs: - if request.echo and request.max_tokens == 0: - token_ids = prompt_token_ids - top_logprobs = prompt_logprobs - output_text = prompt_text - elif request.echo and request.max_tokens > 0: - token_ids = prompt_token_ids + output.token_ids - top_logprobs = prompt_logprobs + output.logprobs - output_text = prompt_text + output.text - else: - token_ids = output.token_ids - top_logprobs = output.logprobs - output_text = output.text - - if request.logprobs is not None: - logprobs = create_logprobs_fn( - token_ids=token_ids, - top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, - ) - else: - logprobs = None - - choice_data = CompletionResponseChoice( - index=len(choices), - text=output_text, - logprobs=logprobs, - finish_reason=output.finish_reason, - ) - choices.append(choice_data) - - num_prompt_tokens += len(prompt_token_ids) - num_generated_tokens += sum( - len(output.token_ids) for output in final_res.outputs) - - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) - - return CompletionResponse( - id=request_id, - created=created_time, - model=model_name, - choices=choices, - usage=usage, - ) - - def merge_async_iterators(*iterators): """Merge multiple asynchronous iterators into a single iterator. @@ -230,8 +62,11 @@ def merge_async_iterators(*iterators): finished = [False] * len(iterators) async def producer(i, iterator): - async for item in iterator: - await queue.put((i, item)) + try: + async for item in iterator: + await queue.put((i, item)) + except Exception as e: + await queue.put(e) finished[i] = True _tasks = [ @@ -242,6 +77,8 @@ async def producer(i, iterator): async def consumer(): while not all(finished) or not queue.empty(): item = await queue.get() + if isinstance(item, Exception): + raise item yield item await asyncio.gather(*_tasks) @@ -312,6 +149,7 @@ async def create_completion(self, request: CompletionRequest, prompt_token_ids=input_ids, lora_request=lora_request)) except ValueError as e: + # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) result_generator: AsyncIterator[Tuple[ @@ -325,27 +163,28 @@ async def create_completion(self, request: CompletionRequest, # Streaming response if stream: - return completion_stream_generator(request, - raw_request, - self.engine.abort, - result_generator, - self._create_logprobs, - request_id, - created_time, - model_name, - num_prompts=len(prompts)) + return self.completion_stream_generator(request, + raw_request, + result_generator, + request_id, + created_time, + model_name, + num_prompts=len(prompts)) # Non-streaming response final_res_batch: RequestOutput = [None] * len(prompts) - async for i, res in result_generator: - if await raw_request.is_disconnected(): - # Abort the request if the client disconnects. 
- await self.engine.abort(f"{request_id}-{i}") - return self.create_error_response("Client disconnected") - final_res_batch[i] = res - response = request_output_to_completion_response( - final_res_batch, request, self._create_logprobs, request_id, - created_time, model_name) + try: + async for i, res in result_generator: + if await raw_request.is_disconnected(): + # Abort the request if the client disconnects. + await self.engine.abort(f"{request_id}-{i}") + return self.create_error_response("Client disconnected") + final_res_batch[i] = res + response = self.request_output_to_completion_response( + final_res_batch, request, request_id, created_time, model_name) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) # When user requests streaming but we don't stream, we still need to # return a streaming response with a single event. @@ -359,3 +198,179 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]: return fake_stream_generator() return response + + async def completion_stream_generator( + self, + request: CompletionRequest, + raw_request: Request, + result_generator: AsyncIterator[Tuple[int, RequestOutput]], + request_id: str, + created_time: int, + model_name: str, + num_prompts: int, + ) -> AsyncGenerator[str, None]: + previous_texts = [""] * request.n * num_prompts + previous_num_tokens = [0] * request.n * num_prompts + has_echoed = [False] * request.n * num_prompts + + try: + async for prompt_idx, res in result_generator: + + # Abort the request if the client disconnects. + if await raw_request.is_disconnected(): + await self.engine.abort(f"{request_id}-{prompt_idx}") + raise StopAsyncIteration() + + for output in res.outputs: + i = output.index + prompt_idx * request.n + # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. 
+ + if request.echo and request.max_tokens == 0: + # only return the prompt + delta_text = res.prompt + delta_token_ids = res.prompt_token_ids + top_logprobs = res.prompt_logprobs + has_echoed[i] = True + elif request.echo and request.max_tokens > 0 and not has_echoed[ + i]: + # echo the prompt and first token + delta_text = res.prompt + output.text + delta_token_ids = res.prompt_token_ids + output.token_ids + top_logprobs = res.prompt_logprobs + (output.logprobs + or []) + has_echoed[i] = True + else: + # return just the delta + delta_text = output.text[len(previous_texts[i]):] + delta_token_ids = output.token_ids[ + previous_num_tokens[i]:] + top_logprobs = output.logprobs[previous_num_tokens[ + i]:] if output.logprobs else None + + if request.logprobs is not None: + assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + logprobs = self._create_logprobs( + token_ids=delta_token_ids, + top_logprobs=top_logprobs, + num_output_top_logprobs=request.logprobs, + initial_text_offset=len(previous_texts[i]), + ) + else: + logprobs = None + + previous_texts[i] = output.text + previous_num_tokens[i] = len(output.token_ids) + finish_reason = output.finish_reason + response_json = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[ + CompletionResponseStreamChoice( + index=i, + text=delta_text, + logprobs=logprobs, + finish_reason=finish_reason, + ) + ]).model_dump_json() + yield f"data: {response_json}\n\n" + + if output.finish_reason is not None: # return final usage + logprobs = LogProbs( + ) if request.logprobs is not None else None + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + final_usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + response_json = CompletionStreamResponse( + id=request_id, + created=created_time, + model=model_name, + choices=[ + CompletionResponseStreamChoice( + index=i, + text="", + logprobs=logprobs, + finish_reason=output.finish_reason, + ) + ], + usage=final_usage, + ).model_dump_json() + yield f"data: {response_json}\n\n" + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + data = self.create_streaming_error_response(str(e)) + print("yield", f"data: {data}\n\n") + yield f"data: {data}\n\n" + + print("yield", "data: [DONE]\n\n") + yield "data: [DONE]\n\n" + + def request_output_to_completion_response( + self, + final_res_batch: List[RequestOutput], + request: CompletionRequest, + request_id: str, + created_time: int, + model_name: str, + ) -> CompletionResponse: + choices = [] + num_prompt_tokens = 0 + num_generated_tokens = 0 + for final_res in final_res_batch: + assert final_res is not None + prompt_token_ids = final_res.prompt_token_ids + prompt_logprobs = final_res.prompt_logprobs + prompt_text = final_res.prompt + + for output in final_res.outputs: + if request.echo and request.max_tokens == 0: + token_ids = prompt_token_ids + top_logprobs = prompt_logprobs + output_text = prompt_text + elif request.echo and request.max_tokens > 0: + token_ids = prompt_token_ids + output.token_ids + top_logprobs = prompt_logprobs + output.logprobs + output_text = prompt_text + output.text + else: + token_ids = output.token_ids + top_logprobs = output.logprobs + output_text = output.text + + if request.logprobs is not None: + logprobs = self._create_logprobs( + token_ids=token_ids, + top_logprobs=top_logprobs, + 
num_output_top_logprobs=request.logprobs, + ) + else: + logprobs = None + + choice_data = CompletionResponseChoice( + index=len(choices), + text=output_text, + logprobs=logprobs, + finish_reason=output.finish_reason, + ) + choices.append(choice_data) + + num_prompt_tokens += len(prompt_token_ids) + num_generated_tokens += sum( + len(output.token_ids) for output in final_res.outputs) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + num_generated_tokens, + ) + + return CompletionResponse( + id=request_id, + created=created_time, + model=model_name, + choices=choices, + usage=usage, + ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 09945471e9af0..230d13d97dbba 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,4 +1,5 @@ import asyncio +import json from dataclasses import dataclass from http import HTTPStatus from typing import Dict, List, Optional, Union @@ -11,6 +12,7 @@ ModelCard, ModelList, ModelPermission) from vllm.lora.request import LoRARequest +from vllm.sequence import Logprob logger = init_logger(__name__) @@ -83,7 +85,7 @@ async def show_available_models(self) -> ModelList: def _create_logprobs( self, token_ids: List[int], - top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None, + top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None, num_output_top_logprobs: Optional[int] = None, initial_text_offset: int = 0, ) -> LogProbs: @@ -95,10 +97,10 @@ def _create_logprobs( for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is not None: - token_logprob = step_top_logprobs[token_id] + token_logprob = step_top_logprobs[token_id].logprob else: token_logprob = None - token = self.tokenizer.convert_ids_to_tokens(token_id) + token = step_top_logprobs[token_id].decoded_token logprobs.tokens.append(token) logprobs.token_logprobs.append(token_logprob) if len(logprobs.text_offset) == 0: @@ -110,7 +112,7 @@ def _create_logprobs( if num_output_top_logprobs: logprobs.top_logprobs.append({ - self.tokenizer.convert_ids_to_tokens(i): p + p.decoded_token: p.logprob for i, p in step_top_logprobs.items() } if step_top_logprobs else None) return logprobs @@ -124,6 +126,19 @@ def create_error_response( type=err_type, code=status_code.value) + def create_streaming_error_response( + self, + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: + json_str = json.dumps({ + "error": + self.create_error_response(message=message, + err_type=err_type, + status_code=status_code).model_dump() + }) + return json_str + async def _check_model(self, request) -> Optional[ErrorResponse]: if request.model == self.served_model: return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 71655b216fb3d..b48dde0318d09 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -8,8 +8,9 @@ tensor_model_parallel_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput, - SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, + SamplerOutput, SequenceData, SequenceGroupOutput, + SequenceOutput) 
from vllm.utils import is_neuron @@ -528,7 +529,10 @@ def _get_logprobs( prompt_logprobs_dict.update( zip(top_token_ids[sample_idx, :num_logprobs].tolist(), top_logprobs[sample_idx, :num_logprobs].tolist())) - group_prompt_logprobs.append(prompt_logprobs_dict) + group_prompt_logprobs.append({ + token_id: Logprob(logprob) + for token_id, logprob in prompt_logprobs_dict.items() + }) sample_idx += 1 query_result_idx += 1 result_prompt_logprobs.append(group_prompt_logprobs) @@ -553,7 +557,10 @@ def _get_logprobs( parent_id, :num_logprobs].tolist(), top_logprobs[sample_idx + parent_id, :num_logprobs].tolist())) - group_sample_logprobs.append(sample_logprobs_dict) + group_sample_logprobs.append({ + token_id: Logprob(logprob) + for token_id, logprob in sample_logprobs_dict.items() + }) result_sample_logprobs.append(group_sample_logprobs) sample_idx += len(seq_ids) diff --git a/vllm/sequence.py b/vllm/sequence.py index 04a9a90a68bcc..a110ab6b748f8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -8,8 +8,16 @@ from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest -PromptLogprobs = List[Optional[Dict[int, float]]] -SampleLogprobs = List[Dict[int, float]] + +@dataclass +class Logprob: + """Infos for supporting OpenAI compatible logprobs.""" + logprob: float + decoded_token: Optional[str] = None + + +PromptLogprobs = List[Optional[Dict[int, Logprob]]] +SampleLogprobs = List[Dict[int, Logprob]] class SequenceStatus(enum.Enum): @@ -196,12 +204,12 @@ def _append_tokens_to_blocks(self, token_ids: List[int]) -> None: def append_token_id( self, token_id: int, - logprobs: Dict[int, float], + logprobs: Dict[int, Logprob], ) -> None: assert token_id in logprobs self._append_tokens_to_blocks([token_id]) self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id]) + self.data.append_token_id(token_id, logprobs[token_id].logprob) def get_len(self) -> int: return self.data.get_len() @@ -456,7 +464,7 @@ def __init__( self, parent_seq_id: int, output_token: int, - logprobs: Dict[int, float], + logprobs: Dict[int, Logprob], ) -> None: self.parent_seq_id = parent_seq_id self.output_token = output_token @@ -470,9 +478,10 @@ def __repr__(self) -> str: def __eq__(self, other: object) -> bool: if not isinstance(other, SequenceOutput): raise NotImplementedError() - return (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token - and self.logprobs == other.logprobs) + equal = (self.parent_seq_id == other.parent_seq_id + and self.output_token == other.output_token) + log_probs_equal = other.logprobs == self.logprobs + return equal and log_probs_equal class SequenceGroupOutput: diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py index 591d1b1300c88..ab3e28389a04c 100644 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ b/vllm/worker/spec_decode/multi_step_worker.py @@ -77,7 +77,7 @@ def _append_new_tokens( token_id = seq_output.output_token token_logprob = seq_output.logprobs[token_id] - seq.append_token_id(token_id, token_logprob) + seq.append_token_id(token_id, token_logprob.logprob) def _shallow_copy_inputs( self, seq_group_metadata_list: List[SequenceGroupMetadata] From ff578cae54d23812b53b6c9b94b8bd0bb293a1fe Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Mon, 4 Mar 2024 14:01:40 -0800 Subject: [PATCH 054/113] Add health check, make async Engine more robust (#3015) Co-authored-by: Zhuohan Li --- tests/async_engine/test_async_llm_engine.py | 32 +++--- 
tests/async_engine/test_request_tracker.py | 38 +++---- vllm/engine/async_llm_engine.py | 113 +++++++++++++++----- vllm/engine/llm_engine.py | 20 ++++ 4 files changed, 138 insertions(+), 65 deletions(-) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1edb19c550010..1e31ff7373031 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -25,12 +25,8 @@ async def step_async(self): return [RequestOutput( request_id=self.request_id)] if self.request_id else [] - async def encode_request_async( - self, - *args, - **kwargs, - ): - return [1] + async def encode_request_async(self, *args, **kwargs): + pass def generate(self, request_id): self.request_id = request_id @@ -43,13 +39,16 @@ def add_request(self, **kwargs): self.add_request_calls += 1 async def add_request_async(self, **kwargs): - del kwargs # Unused self.add_request_calls += 1 + return def abort_request(self, request_id): del request_id # Unused self.abort_request_calls += 1 + def has_unfinished_requests(self): + return self.request_id is not None + class MockAsyncLLMEngine(AsyncLLMEngine): @@ -72,20 +71,21 @@ async def test_new_requests_event(): await engine.add_request("2", "", None) engine.engine.generate("2") await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls == 2 await asyncio.sleep(0) - assert engine.engine.step_calls == 3 + assert engine.engine.add_request_calls == 2 + assert engine.engine.step_calls >= 2 + await asyncio.sleep(0.001) + assert engine.engine.step_calls >= 3 engine.engine.stop_generating() - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 - await asyncio.sleep(0) - assert engine.engine.step_calls == 4 + await asyncio.sleep(0.001) + old_step_calls = engine.engine.step_calls + await asyncio.sleep(0.001) + assert engine.engine.step_calls == old_step_calls await engine.add_request("3", "", None) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == 5 + assert engine.engine.step_calls == old_step_calls + 1 diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index 4043558bae919..7b1f4a9e1eb2f 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -4,25 +4,14 @@ from vllm.outputs import RequestOutput -class DummyEvent: - - def __init__(self): - self.flag = False - - def set(self): - self.flag = True - - def clear(self): - self.flag = False - - -def test_request_tracker(): +@pytest.mark.asyncio +async def test_request_tracker(): tracker = RequestTracker() - tracker.new_requests_event = DummyEvent() stream_1 = tracker.add_request("1") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 1 assert new[0]["request_id"] == "1" assert not finished @@ -30,9 +19,10 @@ def test_request_tracker(): stream_2 = tracker.add_request("2") stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished 
= tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(new) == 2 assert new[0]["request_id"] == "2" assert new[1]["request_id"] == "3" @@ -43,7 +33,7 @@ def test_request_tracker(): # request_ids must be unique with pytest.raises(KeyError): tracker.add_request("1") - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() tracker.abort_request("1") new, finished = tracker.get_new_and_finished_requests() @@ -54,7 +44,8 @@ def test_request_tracker(): stream_4 = tracker.add_request("4") tracker.abort_request("4") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() assert len(finished) == 1 assert "4" in finished @@ -62,11 +53,12 @@ def test_request_tracker(): assert stream_4.finished stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.flag + assert tracker.new_requests_event.is_set() tracker.process_request_output( - RequestOutput("2", "output", [], [], [], bool(finished))) + RequestOutput("2", "output", [], [], [], finished=True)) + await tracker.wait_for_new_requests() new, finished = tracker.get_new_and_finished_requests() - assert not tracker.new_requests_event.flag + assert not tracker.new_requests_event.is_set() assert len(finished) == 1 assert "2" in finished assert len(new) == 1 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index df66139fddcd1..65ab0c0634176 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,8 +1,9 @@ import asyncio +import os import time from functools import partial from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator) + Union, AsyncIterator, Callable) from vllm.lora.request import LoRARequest from vllm.config import ModelConfig @@ -14,28 +15,31 @@ from vllm.sampling_params import SamplingParams logger = init_logger(__name__) +ENGINE_ITERATION_TIMEOUT_S = int( + os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")) class AsyncEngineDeadError(RuntimeError): pass -def _raise_exception_on_finish(task: asyncio.Task, - request_tracker: "RequestTracker") -> None: +def _raise_exception_on_finish( + task: asyncio.Task, error_callback: Callable[[Exception], + None]) -> None: msg = ("Task finished unexpectedly. This should never happen! 
" "Please open an issue on Github.") + + exception = None try: - try: - task.result() - except asyncio.CancelledError: - return - except Exception as exc: - raise AsyncEngineDeadError( - msg + " See stack trace above for the actual cause.") from exc + task.result() + # NOTE: This will be thrown if task exits normally (which it should not) raise AsyncEngineDeadError(msg) - except Exception as exc: - request_tracker.propagate_exception(exc) - raise exc + except Exception as e: + exception = e + logger.error("Engine background task failed", exc_info=e) + error_callback(exception) + raise AsyncEngineDeadError( + msg + " See stack trace above for the actual cause.") from e class AsyncStream: @@ -78,13 +82,13 @@ def __init__(self) -> None: self._finished_requests: asyncio.Queue[str] = asyncio.Queue() self._new_requests: asyncio.Queue[Tuple[AsyncStream, dict]] = asyncio.Queue() - self.new_requests_event = None + self.new_requests_event = asyncio.Event() def __contains__(self, item): return item in self._request_streams - def init_event(self): - self.new_requests_event = asyncio.Event() + def __len__(self) -> int: + return len(self._request_streams) def propagate_exception(self, exc: Exception, @@ -93,9 +97,11 @@ def propagate_exception(self, (all if request_id is None).""" if request_id is not None: self._request_streams[request_id].put(exc) + self.abort_request(request_id) else: - for stream in self._request_streams.values(): + for rid, stream in self._request_streams.items(): stream.put(exc) + self.abort_request(rid) def process_request_output(self, request_output: RequestOutput, @@ -172,12 +178,15 @@ def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]: self._request_streams[stream.request_id] = stream new_requests.append(new_request) - self.new_requests_event.clear() - return new_requests, finished_requests async def wait_for_new_requests(self): - await self.new_requests_event.wait() + if not self.has_new_requests(): + await self.new_requests_event.wait() + self.new_requests_event.clear() + + def has_new_requests(self): + return not self._new_requests.empty() class _AsyncLLMEngine(LLMEngine): @@ -285,6 +294,10 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs + async def check_health_async(self): + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + class AsyncLLMEngine: """An asynchronous wrapper for LLMEngine. 
@@ -335,27 +348,48 @@ def __init__(self, # collected self._background_loop_unshielded = None self.start_engine_loop = start_engine_loop - self._request_tracker = RequestTracker() + self._request_tracker: Optional[RequestTracker] = None + self._errored_with: Optional[BaseException] = None @property def is_running(self) -> bool: return (self.background_loop is not None - and not self.background_loop.done()) + and not self._background_loop_unshielded.done()) + + @property + def is_stopped(self) -> bool: + return self.errored or (self.background_loop is not None + and self._background_loop_unshielded.done()) + + @property + def errored(self) -> bool: + return self._errored_with is not None + + def set_errored(self, exc: Exception) -> None: + self._errored_with = exc + + def _error_callback(self, exc: Exception) -> None: + self.set_errored(exc) + self._request_tracker.propagate_exception(exc) def get_tokenizer(self): return self.engine.tokenizer.tokenizer def start_background_loop(self) -> None: """Start the background loop.""" + if self.errored: + raise AsyncEngineDeadError( + "Background loop has errored already.") from self._errored_with if self.is_running: raise RuntimeError("Background loop is already running.") - self._request_tracker.init_event() + # Initialize the RequestTracker here so it uses the right event loop. + self._request_tracker = RequestTracker() self._background_loop_unshielded = asyncio.get_event_loop( ).create_task(self.run_engine_loop()) self._background_loop_unshielded.add_done_callback( partial(_raise_exception_on_finish, - request_tracker=self._request_tracker)) + error_callback=self._error_callback)) self.background_loop = asyncio.shield(self._background_loop_unshielded) def _init_engine(self, *args, @@ -423,12 +457,23 @@ async def _engine_abort(self, request_ids: Iterable[str]): self.engine.abort_request(request_ids) async def run_engine_loop(self): - # Initialize the RequestTracker here so it uses the right event loop. has_requests_in_progress = False while True: if not has_requests_in_progress: + logger.debug("Waiting for new requests...") await self._request_tracker.wait_for_new_requests() - has_requests_in_progress = await self.engine_step() + logger.debug("Got new requests!") + + # Abort if iteration takes too long due to unrecoverable errors + # (eg. NCCL timeouts). + try: + has_requests_in_progress = await asyncio.wait_for( + self.engine_step(), ENGINE_ITERATION_TIMEOUT_S) + except asyncio.TimeoutError as exc: + logger.error( + "Engine iteration timed out. 
This should never happen!") + self.set_errored(exc) + raise await asyncio.sleep(0) async def add_request( @@ -647,3 +692,19 @@ async def do_log_stats(self) -> None: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() + + async def check_health(self): + """Raises an error if engine is unhealthy.""" + t = time.perf_counter() + logger.debug("Starting health check...") + if self.is_stopped: + raise AsyncEngineDeadError("Background loop is stopped.") + + if self.engine_use_ray: + try: + await self.engine.check_health.remote() + except ray.exceptions.RayActorError as e: + raise RuntimeError("Engine is dead.") from e + else: + await self.engine.check_health_async() + logger.debug(f"Health check took {time.perf_counter()-t}s") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 703756996b7f7..1f518cbf39b21 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1119,3 +1119,23 @@ def _compiled_ray_dag(self): for worker in self.workers ]) return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.parallel_config.worker_use_ray: + return + + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") From 9a4548bae73a8831f668116d8a6e88491d933a4e Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Mon, 4 Mar 2024 18:51:56 -0500 Subject: [PATCH 055/113] Fix the openai benchmarking requests to work with latest OpenAI apis (#2992) Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/backend_request_func.py | 70 ++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index e7f74e2feaf86..d7cac22ce7a99 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -275,10 +275,80 @@ async def async_request_openai_completions( return output +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith( + "v1/chat/completions" + ), "OpenAI Chat API URL must end with 'v1/chat/completions'." 
+ + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + assert not request_func_input.use_beam_search + payload = { + "model": request_func_input.model, + "messages": [ + { + "role": "user", + "content": request_func_input.prompt, + }, + ], + "temperature": 0.0, + "max_tokens": request_func_input.output_len, + "stream": True, + } + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0 + st = time.perf_counter() + try: + async with session.post(url=api_url, json=payload, + headers=headers) as response: + if response.status == 200: + async for chunk in response.content: + if ttft == 0: + ttft = time.perf_counter() - st + output.ttft = ttft + + chunk = chunk.strip() + if not chunk: + continue + + chunk = chunk.decode("utf-8").lstrip("data: ") + if chunk == "[DONE]": + latency = time.perf_counter() - st + else: + body = json.loads(chunk) + if "content" in body["choices"][0]["delta"]: + generated_text += body["choices"][0]["delta"][ + "content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + else: + output.success = False + except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): + output.success = False + + if pbar: + pbar.update(1) + return output + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_vllm, "deepspeed-mii": async_request_deepspeed_mii, "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, } From 05af6da8d927f70d15ab1ed25b01df3c967ad961 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Mon, 4 Mar 2024 21:14:53 -0500 Subject: [PATCH 056/113] [ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#3123) Co-authored-by: lcskrishna --- Dockerfile.rocm | 30 +++++++++++++++++++++++++----- vllm/worker/worker.py | 4 +--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 54ae06be6e101..a45265d79a6ac 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH" # In that case, we need to use the python reference attention implementation in vllm ARG BUILD_FA="1" +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd ..; \ fi -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
# Manually removed it so that later steps of numpy upgrade can continue RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9df518d155ec2..157e8c45836b1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest -from vllm.utils import is_hip class Worker: @@ -267,8 +266,7 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None - and not is_hip()): + elif (parallel_config.world_size > 1 and cupy_port is not None): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection. 
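The worker.py hunk above drops the `not is_hip()` guard, so the CuPy process group is now initialized on ROCm as well whenever more than one worker is in use, which is what makes CUDA-graph mode possible on AMD GPUs. The accompanying Dockerfile.rocm change builds ROCm's CuPy fork by default; it can be skipped by overriding the new BUILD_CUPY build argument, which mirrors the existing BUILD_FA switch. As a minimal illustration only (the helper below is hypothetical and not part of vLLM), the guard after this commit reduces to:

# Illustration only: a hypothetical helper mirroring the condition introduced
# by this commit. HIP/ROCm no longer bypasses CuPy initialization; only a
# world size of 1 or a missing CuPy port does.
from typing import Optional

def should_init_cupy_group(world_size: int, cupy_port: Optional[int]) -> bool:
    return world_size > 1 and cupy_port is not None
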
From 8999ec3c1632c91c194ab27df6bf274f5bcb0b5f Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 5 Mar 2024 15:35:43 -0800 Subject: [PATCH 057/113] Store `eos_token_id` in `Sequence` for easy access (#3166) --- tests/test_cache_block_hashing.py | 3 +- vllm/core/scheduler.py | 7 ++--- vllm/engine/llm_engine.py | 30 +++++++++----------- vllm/model_executor/layers/sampler.py | 1 - vllm/outputs.py | 41 ++++++++++++++------------- vllm/sequence.py | 11 ++++--- 6 files changed, 44 insertions(+), 49 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 7c4ade7f8c8ed..c2067e52b59c0 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -54,7 +54,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): for prompt in prompts: hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id) num_blocks = len(prompt_token_ids) // block_size for idx in range(num_blocks): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1ae58f525b0fb..c96c6d62ef19d 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -59,10 +59,9 @@ def is_empty(self) -> bool: and not self.blocks_to_swap_out and not self.blocks_to_copy) def _sort_by_lora_ids(self) -> bool: - self.scheduled_seq_groups = sorted( - self.scheduled_seq_groups, - key=lambda g: (g.lora_request.lora_int_id - if g.lora_request else 0, g.request_id)) + self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, + key=lambda g: + (g.lora_int_id, g.request_id)) @property def lora_requests(self) -> Set[LoRARequest]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1f518cbf39b21..52dc96e2b82e1 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -491,8 +491,10 @@ def add_request( # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) + eos_token_id = self.tokenizer.get_lora_tokenizer( + lora_request).eos_token_id seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - lora_request) + eos_token_id, lora_request) # Defensive copy of SamplingParams, which are used by the sampler, # this doesn't deep-copy LogitsProcessor objects @@ -548,15 +550,13 @@ def _check_beam_search_early_stopping( if early_stopping is True: return True - current_worst_score = (current_worst_seq.get_beam_search_score( + current_worst_score = current_worst_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - current_worst_seq).eos_token_id)) + eos_token_id=current_worst_seq.eos_token_id) if early_stopping is False: - highest_attainable_score = (best_running_seq.get_beam_search_score( + highest_attainable_score = best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id) else: assert early_stopping == "never" if length_penalty > 0.0: @@ -570,8 +570,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id, + eos_token_id=best_running_seq.eos_token_id, seq_len=max_possible_length)) else: # Otherwise, beam search will prefer shorter sequences. 
The @@ -580,8 +579,7 @@ def _check_beam_search_early_stopping( highest_attainable_score = ( best_running_seq.get_beam_search_score( length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq( - best_running_seq).eos_token_id)) + eos_token_id=best_running_seq.eos_token_id)) return current_worst_score >= highest_attainable_score def _process_sequence_group_outputs(self, seq_group: SequenceGroup, @@ -679,8 +677,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, all_finished_seqs = existing_finished_seqs + new_finished_seqs # Sort the finished sequences by their scores. all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) for seq, parent, is_new in all_finished_seqs[:beam_width]: if is_new: @@ -707,8 +704,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, if not seq.is_finished()] # Sort the running sequences by their scores. running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score( - length_penalty=length_penalty, - eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id), + length_penalty=length_penalty, eos_token_id=x[0].eos_token_id), reverse=True) # Check if we can stop the beam search. @@ -1014,8 +1010,8 @@ def _check_stop(self, seq: Sequence, return # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) and seq.get_last_token_id() - == self.get_tokenizer_for_seq(seq).eos_token_id): + if ((not sampling_params.ignore_eos) + and seq.get_last_token_id() == seq.eos_token_id): seq.status = SequenceStatus.FINISHED_STOPPED return diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b48dde0318d09..320cb443524ca 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -516,7 +516,6 @@ def _get_logprobs( if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): num_logprobs = sampling_params.prompt_logprobs - prompt_len = sampling_metadata.prompt_lens[i] prompt_tokens = sampling_metadata.seq_data[ seq_ids[0]].prompt_token_ids group_prompt_logprobs: PromptLogprobs = [None] diff --git a/vllm/outputs.py b/vllm/outputs.py index a6de2a5a2257b..4f9eddee11cd4 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -90,29 +90,30 @@ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": # Get the top-n sequences. n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if seq_group.sampling_params.use_beam_search: - sorting_key = lambda seq: seq.get_beam_search_score( - seq_group.sampling_params.length_penalty) + if n == 1: + top_n_seqs = seqs else: - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] + if seq_group.sampling_params.use_beam_search: + sorting_key = lambda seq: seq.get_beam_search_score( + seq_group.sampling_params.length_penalty) + else: + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] # Create the outputs. 
- outputs: List[CompletionOutput] = [] - for seq in top_n_seqs: - logprobs = seq.output_logprobs - if seq_group.sampling_params.logprobs is None: - # NOTE: We need to take care of this case because the sequence - # always has the logprobs of the sampled tokens even if the - # logprobs are not requested. - logprobs = None - finshed_reason = SequenceStatus.get_finished_reason(seq.status) - output = CompletionOutput(seqs.index(seq), seq.output_text, - seq.get_output_token_ids(), - seq.get_cumulative_logprob(), logprobs, - finshed_reason) - outputs.append(output) + # NOTE: We need omit logprobs here explicitly because the sequence + # always has the logprobs of the sampled tokens even if the + # logprobs are not requested. + include_logprobs = seq_group.sampling_params.logprobs + outputs = [ + CompletionOutput(seqs.index(seq), seq.output_text, + seq.get_output_token_ids(), + seq.get_cumulative_logprob(), + seq.output_logprobs if include_logprobs else None, + SequenceStatus.get_finished_reason(seq.status)) + for seq in top_n_seqs + ] # Every sequence in the sequence group should have the same prompt. prompt = seq_group.prompt diff --git a/vllm/sequence.py b/vllm/sequence.py index a110ab6b748f8..97b72fdc4cbeb 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,11 +142,13 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, + eos_token_id: int, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id self.prompt = prompt self.block_size = block_size + self.eos_token_id = eos_token_id self.lora_request = lora_request self.data = SequenceData(prompt_token_ids) @@ -362,12 +364,9 @@ def get_seqs( self, status: Optional[SequenceStatus] = None, ) -> List[Sequence]: - if status is None: - return list(self.seqs_dict.values()) - else: - return [ - seq for seq in self.seqs_dict.values() if seq.status == status - ] + return list(self.seqs_dict.values()) if status is None else [ + seq for seq in self.seqs_dict.values() if seq.status == status + ] def get_unfinished_seqs(self) -> List[Sequence]: return [ From 2efce05dc3c7c1e367617465f8f661a058499e37 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 5 Mar 2024 16:17:20 -0800 Subject: [PATCH 058/113] [Fix] Avoid pickling entire LLMEngine for Ray workers (#3207) Co-authored-by: Antoni Baum --- vllm/engine/llm_engine.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 52dc96e2b82e1..8484014c9a13f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -158,6 +158,11 @@ def __init__( if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() + def __reduce__(self): + # This is to ensure that the LLMEngine is not referenced in + # the closure used to initialize Ray worker actors + raise RuntimeError("LLMEngine should not be pickled!") + def get_tokenizer_for_seq(self, sequence: Sequence): return self.tokenizer.get_lora_tokenizer(sequence.lora_request) @@ -280,6 +285,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", parallel_config = copy.deepcopy(self.parallel_config) scheduler_config = copy.deepcopy(self.scheduler_config) device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype for rank, (worker, (node_id, _)) in enumerate(zip(self.workers, @@ -295,22 +302,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank, rank, distributed_init_method, - 
lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, )) driver_rank = 0 driver_local_rank = node_workers[driver_node_id].index(driver_rank) self.driver_worker = Worker( - model_config, - parallel_config, - scheduler_config, - device_config, + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, driver_local_rank, driver_rank, distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, + kv_cache_dtype=kv_cache_dtype, is_driver_worker=True, ) From 24aecf421a4ad5989697010963074904fead9a1b Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Wed, 6 Mar 2024 11:23:34 +0900 Subject: [PATCH 059/113] [Tests] Add block manager and scheduler tests (#3108) --- .buildkite/test-pipeline.yaml | 3 + tests/core/__init__.py | 0 tests/core/test_block_manager.py | 262 +++++++++++++++++++++++++++++++ tests/core/test_scheduler.py | 170 ++++++++++++++++++++ tests/core/utils.py | 27 ++++ 5 files changed, 462 insertions(+) create mode 100644 tests/core/__init__.py create mode 100644 tests/core/test_block_manager.py create mode 100644 tests/core/test_scheduler.py create mode 100644 tests/core/utils.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c65ab04b8ddda..15f971b66e3bd 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,6 +13,9 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness + +- label: Core Test + command: pytest -v -s core - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py diff --git a/tests/core/__init__.py b/tests/core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py new file mode 100644 index 0000000000000..ecdf3025cffdf --- /dev/null +++ b/tests/core/test_block_manager.py @@ -0,0 +1,262 @@ +import pytest +import time +from typing import List + +from vllm import SamplingParams +from vllm.block import PhysicalTokenBlock +from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus +from vllm.utils import Device +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus + +from .utils import create_dummy_prompt + + +def test_block_allocator_allocate(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + num_free = num_cpu_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + num_free -= 1 + assert block not in cpu_allocator.free_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.allocate() + + +def test_block_allocator_free(): + block_size = 4 + num_cpu_blocks = 4 + cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks) + + # Allocate all available cpu blocks. + blocks: List[PhysicalTokenBlock] = [] + for _ in range(num_cpu_blocks): + block = cpu_allocator.allocate() + blocks.append(block) + assert block not in cpu_allocator.free_blocks + + # Free all allocated cpu blocks. 
+ num_free = 0 + assert cpu_allocator.get_num_free_blocks() == num_free + for block in blocks: + cpu_allocator.free(block) + num_free += 1 + assert block in cpu_allocator.free_blocks + assert cpu_allocator.get_num_free_blocks() == num_free + + with pytest.raises(ValueError): + cpu_allocator.free(block) + + +def test_allocate(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range(num_gpu_blocks - 1): + _, seq_group = create_dummy_prompt(str(i), block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + +def test_append_slot_single_seq(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate single seq to gpu block. + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Nothing to append. Sequence has no new logical blocks. + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks == after_blocks + + # Add block_size number of new tokens and append slot. + for i in range(block_size): + token_id = i + 5 + prompt.append_token_id(token_id, {token_id: 0.0}) + + assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + assert not block_manager.append_slot(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_append_slot_cow(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate prompt to gpu block. + prompt = Sequence(1, "one two three", [1, 2, 3], block_size) + child = prompt.fork(2) + token_id = 4 + child.append_token_id(token_id, {token_id: 0.0}) + seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), + time.time(), time.perf_counter) + block_manager.allocate(seq_group) + + # Append slot for child token. + # Last block being modified is shared. Copy on write occurs. 
+ assert block_manager.can_append_slot(seq_group) + before_blocks = block_manager.get_num_free_gpu_blocks() + src_block, dst_block = block_manager.append_slot(child) + assert src_block != dst_block + + after_blocks = block_manager.get_num_free_gpu_blocks() + assert before_blocks - after_blocks == 1 + + +def test_fork(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", + block_size - 1, + block_size=block_size) + block_manager.allocate(seq_group) + + # Fork prompt and copy block tables. + child = prompt.fork(2) + block_manager.fork(prompt, child) + assert block_manager.get_block_table( + prompt) == block_manager.get_block_table(child) + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: 0.0}) + block_manager.append_slot(child) + assert block_manager.get_block_table( + prompt) != block_manager.get_block_table(child) + + +def test_swap(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: 0.0}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert list(mapping.keys()) == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +def test_free(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + prompt, seq_group = create_dummy_prompt("1", block_size) + block_manager.allocate(seq_group) + + # Free allocated seq. + prompt_blocks = len(block_manager.get_block_table(prompt)) + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(prompt) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed seq is deleted. 
+ with pytest.raises(KeyError): + block_manager.get_block_table(prompt) + + +def test_reset(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. + original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks): + _, seq_group = create_dummy_prompt(str(i), block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py new file mode 100644 index 0000000000000..6322b2f2d5e9e --- /dev/null +++ b/tests/core/test_scheduler.py @@ -0,0 +1,170 @@ +from typing import List +import pytest # noqa + +from vllm.config import CacheConfig, SchedulerConfig +from vllm.core.scheduler import Scheduler +from vllm.sequence import SequenceGroup + +from .utils import create_dummy_prompt + + +def test_scheduler_add_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq group to scheduler. + num_seq_group = 4 + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + assert scheduler.get_num_unfinished_seq_groups() == i + 1 + + +def test_scheduler_abort_seq_group(): + block_size = 4 + scheduler_config = SchedulerConfig(100, 64, 1, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 4 + cache_config.num_gpu_blocks = 4 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add multiple seq groups to scheduler. + num_seq_group = 4 + request_ids = set() + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), block_size) + scheduler.add_seq_group(seq_group) + request_ids.add(str(i)) + + # Abort all added seq groups. + assert scheduler.get_num_unfinished_seq_groups() == num_seq_group + scheduler.abort_seq_group(request_ids) + assert scheduler.get_num_unfinished_seq_groups() == 0 + + +def test_scheduler_schedule_simple(): + block_size = 4 + num_seq_group = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + running: List[SequenceGroup] = [] + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + scheduler.add_seq_group(seq_group) + running.append(seq_group) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group * seq_group.get_seqs( + )[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + # Schedule seq groups generation. 
+ seq_group_meta, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set(running) + assert out.num_batched_tokens == num_seq_group + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == num_seq_group + + +def test_scheduler_schedule_preempt_abort(): + block_size = 4 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, 2, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 2 + cache_config.num_gpu_blocks = 2 + scheduler = Scheduler(scheduler_config, cache_config, None) + + # Add seq groups to scheduler. + seq_a, seq_group_a = create_dummy_prompt("1", block_size) + seq_b, seq_group_b = create_dummy_prompt("2", block_size) + scheduler.add_seq_group(seq_group_a) + scheduler.add_seq_group(seq_group_b) + + # Schedule seq groups prompts. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a, seq_group_b] + assert out.num_batched_tokens == seq_group_a.get_seqs()[0].get_len() * 2 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 2 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Append "generated" tokens, allowing the sequence to mark prompt tokens as + # processed. + token_id = 0 + seq_a.append_token_id(token_id, {token_id: 0.0}) + seq_b.append_token_id(token_id, {token_id: 0.0}) + + # Schedule seq groups generation and preempt seq group b. + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_a] + assert out.num_batched_tokens == 1 + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 2 + + # Abort seq group a. Re-schedule seq group b prompt with recomputation. + scheduler.abort_seq_group("1") + seq_group_meta, out = scheduler.schedule() + assert out.scheduled_seq_groups == [seq_group_b] + assert out.num_batched_tokens == seq_group_b.get_seqs()[0].get_len() + assert (not out.blocks_to_copy and not out.blocks_to_swap_in + and not out.blocks_to_swap_out) + assert len(seq_group_meta) == 1 + assert scheduler.get_num_unfinished_seq_groups() == 1 + + +def test_scheduler_max_seqs(): + block_size = 4 + num_seq_group = 4 + max_seq_group = 2 + max_model_len = 16 + scheduler_config = SchedulerConfig(64, max_seq_group, max_model_len, 256) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 8 + cache_config.num_gpu_blocks = 8 + scheduler = Scheduler(scheduler_config, cache_config, None) + + all_seq_groups: List[SequenceGroup] = [] + # Add seq groups to scheduler. + for i in range(num_seq_group): + _, seq_group = create_dummy_prompt(str(i), prompt_length=block_size) + all_seq_groups.append(seq_group) + + # Append 1 seq group + scheduler.add_seq_group(all_seq_groups[0]) + + # Schedule seq groups prompts. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Schedule seq groups generation. + _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[0]]) + + # Append 2 more seq group + scheduler.add_seq_group(all_seq_groups[1]) + scheduler.add_seq_group(all_seq_groups[2]) + + # Schedule seq groups prompts. + # Only 1 seq group should be scheduled since max_seq_group is 2 + # and one is prompting. 
+ _, out = scheduler.schedule() + assert set(out.scheduled_seq_groups) == set([all_seq_groups[1]]) diff --git a/tests/core/utils.py b/tests/core/utils.py new file mode 100644 index 0000000000000..9c0cfe1a7cf66 --- /dev/null +++ b/tests/core/utils.py @@ -0,0 +1,27 @@ +import time +from typing import Tuple + +from vllm import SamplingParams +from vllm.sequence import Sequence, SequenceGroup + + +def create_dummy_prompt( + request_id: str, + prompt_length: int, + block_size: int = None) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) + prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) + seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), + time.time(), None, None) + + return prompt, seq_group + + +def round_up_to_next_block(seq_len: int, block_size: int) -> int: + return (seq_len + block_size - 1) // block_size From a33ce60c6629e8c22aaf002ae8478a685e726e3e Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Wed, 6 Mar 2024 01:04:23 -0800 Subject: [PATCH 060/113] [Testing] Fix core tests (#3224) --- tests/core/test_block_manager.py | 49 ++++++++++++++++++++------------ tests/core/test_scheduler.py | 6 ++-- tests/core/utils.py | 2 +- vllm/sequence.py | 2 +- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index ecdf3025cffdf..04d01f7724e4f 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -6,7 +6,7 @@ from vllm.block import PhysicalTokenBlock from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus from vllm.utils import Device -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus +from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob from .utils import create_dummy_prompt @@ -22,7 +22,8 @@ def test_block_allocator_allocate(): for _ in range(num_cpu_blocks): block = cpu_allocator.allocate() num_free -= 1 - assert block not in cpu_allocator.free_blocks + + assert block.block_hash not in cpu_allocator.evictor assert cpu_allocator.get_num_free_blocks() == num_free with pytest.raises(ValueError): @@ -39,7 +40,7 @@ def test_block_allocator_free(): for _ in range(num_cpu_blocks): block = cpu_allocator.allocate() blocks.append(block) - assert block not in cpu_allocator.free_blocks + assert block.block_hash not in cpu_allocator.evictor # Free all allocated cpu blocks. num_free = 0 @@ -47,7 +48,7 @@ def test_block_allocator_free(): for block in blocks: cpu_allocator.free(block) num_free += 1 - assert block in cpu_allocator.free_blocks + assert block.block_hash in cpu_allocator.evictor assert cpu_allocator.get_num_free_blocks() == num_free with pytest.raises(ValueError): @@ -106,7 +107,7 @@ def test_append_slot_single_seq(): # Add block_size number of new tokens and append slot. 
for i in range(block_size): token_id = i + 5 - prompt.append_token_id(token_id, {token_id: 0.0}) + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) assert block_manager.can_append_slot(seq_group) before_blocks = block_manager.get_num_free_gpu_blocks() @@ -119,25 +120,37 @@ def test_append_slot_cow(): block_size = 4 num_cpu_blocks = 4 num_gpu_blocks = 4 - block_manager = BlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, + block_manager = BlockSpaceManager(block_size=block_size, + num_cpu_blocks=num_cpu_blocks, + num_gpu_blocks=num_gpu_blocks, watermark=0) - # Allocate prompt to gpu block. - prompt = Sequence(1, "one two three", [1, 2, 3], block_size) - child = prompt.fork(2) - token_id = 4 - child.append_token_id(token_id, {token_id: 0.0}) + # Allocate prompt to gpu block. There is one slot left in the block. + prompt = Sequence(seq_id=1, + prompt="one two three", + prompt_token_ids=[1, 2, 3], + block_size=block_size) + + # Fork the sequence, such that a COW will be required when we append a new + # token id. + child = prompt.fork(new_seq_id=2) + + # Allocate space for the sequence group. seq_group = SequenceGroup("1", [prompt, child], SamplingParams(), time.time(), time.perf_counter) block_manager.allocate(seq_group) - # Append slot for child token. - # Last block being modified is shared. Copy on write occurs. + # Fork and append a new token id. We expect a COW to be scheduled. + token_id = 4 + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.fork(prompt, child) + assert block_manager.can_append_slot(seq_group) before_blocks = block_manager.get_num_free_gpu_blocks() - src_block, dst_block = block_manager.append_slot(child) + + maybe_src_dst_block = block_manager.append_slot(child) + assert maybe_src_dst_block is not None + src_block, dst_block = maybe_src_dst_block assert src_block != dst_block after_blocks = block_manager.get_num_free_gpu_blocks() @@ -165,7 +178,7 @@ def test_fork(): prompt) == block_manager.get_block_table(child) token_id = 4 # Append token to child. Block is shared so copy on write occurs. - child.append_token_id(token_id, {token_id: 0.0}) + child.append_token_id(token_id, {token_id: Logprob(0.0)}) block_manager.append_slot(child) assert block_manager.get_block_table( prompt) != block_manager.get_block_table(child) @@ -189,7 +202,7 @@ def test_swap(): # tokens will be written in the next forward pass. token_id = 0 prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: 0.0}) + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) # Swap seq group from GPU -> CPU. gpu_blocks = block_manager.get_block_table(prompt) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 6322b2f2d5e9e..ebfeb8ba04812 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -3,7 +3,7 @@ from vllm.config import CacheConfig, SchedulerConfig from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup +from vllm.sequence import SequenceGroup, Logprob from .utils import create_dummy_prompt @@ -108,8 +108,8 @@ def test_scheduler_schedule_preempt_abort(): # Append "generated" tokens, allowing the sequence to mark prompt tokens as # processed. token_id = 0 - seq_a.append_token_id(token_id, {token_id: 0.0}) - seq_b.append_token_id(token_id, {token_id: 0.0}) + seq_a.append_token_id(token_id, {token_id: Logprob(0.0)}) + seq_b.append_token_id(token_id, {token_id: Logprob(0.0)}) # Schedule seq groups generation and preempt seq group b. 
seq_group_meta, out = scheduler.schedule() diff --git a/tests/core/utils.py b/tests/core/utils.py index 9c0cfe1a7cf66..6469789e89386 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -18,7 +18,7 @@ def create_dummy_prompt( prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size) seq_group = SequenceGroup(request_id, [prompt], SamplingParams(), - time.time(), None, None) + time.time(), None) return prompt, seq_group diff --git a/vllm/sequence.py b/vllm/sequence.py index 97b72fdc4cbeb..19dafe3cb0fc9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -142,7 +142,7 @@ def __init__( prompt: str, prompt_token_ids: List[int], block_size: int, - eos_token_id: int, + eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, ) -> None: self.seq_id = seq_id From 4cb3b924cdeb6b809f0a0311f9833253d9162699 Mon Sep 17 00:00:00 2001 From: Chujie Zheng Date: Wed, 6 Mar 2024 14:41:42 -0800 Subject: [PATCH 061/113] Add tqdm `dynamic_ncols=True` (#3242) --- vllm/entrypoints/llm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62f1d172377f6..1f463bdaaedc3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -191,7 +191,9 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() - pbar = tqdm(total=num_requests, desc="Processed prompts") + pbar = tqdm(total=num_requests, + desc="Processed prompts", + dynamic_ncols=True) # Run the engine. outputs: List[RequestOutput] = [] while self.llm_engine.has_unfinished_requests(): From d3c04b6a39df016504c28ec3fc27ea58ca802a28 Mon Sep 17 00:00:00 2001 From: TechxGenus Date: Thu, 7 Mar 2024 08:19:14 +0800 Subject: [PATCH 062/113] Add GPTQ support for Gemma (#3200) --- vllm/model_executor/models/gemma.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03948132d32c3..bf1f164ff700d 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -325,11 +325,17 @@ def load_weights(self, if shard_name not in name: continue name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue # GemmaRMSNorm is different from Llama's in that it multiplies # (1 + weight) to the output, instead of just weight. if "norm.weight" in name: From cbf4c05b156c8705c6bb1a94b9edc0a5b4d26e20 Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Thu, 7 Mar 2024 03:39:28 -0500 Subject: [PATCH 063/113] Update requirements-dev.txt to include package for benchmarking scripts. 
(#3181) Co-authored-by: Zhuohan Li --- requirements-dev.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 55e102374fd73..dfcbfa4253f1c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,3 +21,6 @@ einops # required for MPT openai requests ray + +# Benchmarking +aiohttp From 2daf23ab0cf00da157b1255faddcf0a269283d36 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Mar 2024 01:45:50 -0800 Subject: [PATCH 064/113] Separate attention backends (#3005) --- .gitignore | 3 + setup.py | 48 +++- tests/kernels/test_prefix_prefill.py | 2 +- vllm/__init__.py | 30 ++- .../layers/attention/__init__.py | 5 + .../layers/attention/attention.py | 59 +++++ .../backends}/__init__.py | 0 .../layers/attention/backends/flash_attn.py | 124 ++++++++++ .../backends/xformers.py} | 216 +++++------------- .../layers/attention/ops/__init__.py | 0 .../layers/attention/ops/paged_attn.py | 138 +++++++++++ .../ops}/prefix_prefill.py | 0 vllm/model_executor/models/baichuan.py | 13 +- vllm/model_executor/models/bloom.py | 10 +- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/deepseek.py | 10 +- vllm/model_executor/models/falcon.py | 28 +-- vllm/model_executor/models/gemma.py | 10 +- vllm/model_executor/models/gpt2.py | 6 +- vllm/model_executor/models/gpt_bigcode.py | 10 +- vllm/model_executor/models/gpt_j.py | 4 +- vllm/model_executor/models/gpt_neox.py | 4 +- vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/llama.py | 12 +- vllm/model_executor/models/mixtral.py | 4 +- vllm/model_executor/models/mixtral_quant.py | 4 +- vllm/model_executor/models/mpt.py | 12 +- vllm/model_executor/models/olmo.py | 8 +- vllm/model_executor/models/opt.py | 8 +- vllm/model_executor/models/orion.py | 10 +- vllm/model_executor/models/phi.py | 4 +- vllm/model_executor/models/qwen.py | 4 +- vllm/model_executor/models/qwen2.py | 12 +- vllm/model_executor/models/stablelm.py | 10 +- vllm/model_executor/models/starcoder2.py | 4 +- 35 files changed, 558 insertions(+), 268 deletions(-) create mode 100644 vllm/model_executor/layers/attention/__init__.py create mode 100644 vllm/model_executor/layers/attention/attention.py rename vllm/model_executor/layers/{triton_kernel => attention/backends}/__init__.py (100%) create mode 100644 vllm/model_executor/layers/attention/backends/flash_attn.py rename vllm/model_executor/layers/{attention.py => attention/backends/xformers.py} (56%) create mode 100644 vllm/model_executor/layers/attention/ops/__init__.py create mode 100644 vllm/model_executor/layers/attention/ops/paged_attn.py rename vllm/model_executor/layers/{triton_kernel => attention/ops}/prefix_prefill.py (100%) diff --git a/.gitignore b/.gitignore index b5195629e5cf3..0b14c98270c41 100644 --- a/.gitignore +++ b/.gitignore @@ -184,3 +184,6 @@ _build/ # Benchmark dataset *.json + +# Third-party Python packages. +vllm/thirdparty_files/ diff --git a/setup.py b/setup.py index 745b5a9b2d02a..57d7a139e8237 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import os import re import subprocess +import sys import warnings from pathlib import Path from typing import List, Set @@ -14,6 +15,8 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) +# This is a temporary directory to store third-party packages. 
+THIRDPARTY_SUBDIR = "vllm/thirdparty_files" # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. @@ -324,8 +327,46 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_neuron(): - neuronxcc_version = get_neuronxcc_version() + + # Download the FlashAttention package. + # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/setup.py#L518-L530 + flash_attn_version = "2.5.6" + install_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) + subprocess.check_call( + [ + sys.executable, + "-m", + "pip", + "install", + "-q", + f"--target={install_dir}", + "einops", # Dependency of flash-attn. + f"flash-attn=={flash_attn_version}", + "--no-dependencies", # Required to avoid re-installing torch. + ], + env=dict(os.environ, CC="gcc"), + ) + + # Copy the FlashAttention package into the vLLM package after build. + class build_ext(BuildExtension): + + def run(self): + super().run() + target_dir = os.path.join(self.build_lib, THIRDPARTY_SUBDIR) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + self.copy_tree(install_dir, target_dir) + + class BinaryDistribution(setuptools.Distribution): + + def has_ext_modules(self): + return True + +else: + build_ext = BuildExtension + BinaryDistribution = setuptools.Distribution + if _is_neuron(): + neuronxcc_version = get_neuronxcc_version() vllm_extension_sources = [ "csrc/cache_kernels.cu", @@ -468,6 +509,7 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, + cmdclass={"build_ext": build_ext} if not _is_neuron() else {}, + distclass=BinaryDistribution, package_data=package_data, ) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index c068b38a66910..e881cd1ec3753 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,7 +3,7 @@ import time import torch -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( context_attention_fwd) from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask diff --git a/vllm/__init__.py b/vllm/__init__.py index f1e30f5eb6e6e..59f1345b58d42 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,12 +1,28 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster -from vllm.entrypoints.llm import LLM -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import SamplingParams + +# Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 +def _configure_system(): + import os + import sys + + # Importing flash-attn. + thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), + "thirdparty_files") + sys.path.insert(0, thirdparty_files) + + +_configure_system() +# Delete configuration function. 
+del _configure_system + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 +from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 +from vllm.engine.llm_engine import LLMEngine # noqa: E402 +from vllm.engine.ray_utils import initialize_cluster # noqa: E402 +from vllm.entrypoints.llm import LLM # noqa: E402 +from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 +from vllm.sampling_params import SamplingParams # noqa: E402 __version__ = "0.3.3" diff --git a/vllm/model_executor/layers/attention/__init__.py b/vllm/model_executor/layers/attention/__init__.py new file mode 100644 index 0000000000000..1c42a3d28f976 --- /dev/null +++ b/vllm/model_executor/layers/attention/__init__.py @@ -0,0 +1,5 @@ +from vllm.model_executor.layers.attention.attention import Attention + +__all__ = [ + "Attention", +] diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py new file mode 100644 index 0000000000000..830e82e10f7ad --- /dev/null +++ b/vllm/model_executor/layers/attention/attention.py @@ -0,0 +1,59 @@ +"""Attention layer.""" +from typing import List, Optional + +import torch +import torch.nn as nn + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.utils import is_hip + + +class Attention(nn.Module): + """Attention layer. + + This class takes query, key, and value tensors as input. The input tensors + can either contain prompt tokens or generation tokens. + The class does the following: + + 1. Store the input key and value tensors in the KV cache. + 2. Perform (multi-head/multi-query/grouped-query) attention. + 3. Return the output tensor. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + super().__init__() + if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and + torch.get_default_dtype() in (torch.float16, torch.bfloat16)): + # Ampere or later NVIDIA GPUs. + # NOTE(woosuk): FlashAttention does not support FP32. + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + self.backend = FlashAttentionBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + else: + # Turing and Volta NVIDIA GPUs or AMD GPUs. + # Or FP32 on any GPU. 
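# The backend dispatch in Attention.__init__ above reduces to a small
# device/dtype predicate. The sketch below restates it for illustration
# only; the helper name _prefer_flash_attn is not part of the patch, and
# a CUDA-enabled PyTorch build is assumed.
import torch

def _prefer_flash_attn(rocm: bool = False) -> bool:
    # FlashAttention is selected only on NVIDIA GPUs with compute
    # capability 8.0+ (Ampere or newer) and a half-precision default
    # dtype; everything else falls back to the xFormers backend.
    if rocm:
        return False
    if torch.cuda.get_device_capability()[0] < 8:
        return False
    return torch.get_default_dtype() in (torch.float16, torch.bfloat16)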
+ from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend + self.backend = XFormersBackend(num_heads, head_size, scale, + num_kv_heads, alibi_slopes, + sliding_window) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + return self.backend.forward(query, key, value, key_cache, value_cache, + input_metadata) diff --git a/vllm/model_executor/layers/triton_kernel/__init__.py b/vllm/model_executor/layers/attention/backends/__init__.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/__init__.py rename to vllm/model_executor/layers/attention/backends/__init__.py diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py new file mode 100644 index 0000000000000..512f4e49c7eb2 --- /dev/null +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -0,0 +1,124 @@ +"""Attention layer with Flash and PagedAttention.""" +from typing import List, Optional + +# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. +from flash_attn import flash_attn_func +import torch + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) + + +class FlashAttentionBackend: + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + self.sliding_window = ((self.sliding_window, self.sliding_window) if + self.sliding_window is not None else (-1, -1)) + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: Optional[torch.Tensor], + value_cache: Optional[torch.Tensor], + input_metadata: InputMetadata, + ) -> torch.Tensor: + """Forward pass with FlashAttention and PagedAttention. + + Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + key_cache: shape = [num_blocks, num_kv_heads, head_size/x, + block_size, x] + value_cache: shape = [num_blocks, num_kv_heads, head_size, + block_size] + input_metadata: metadata for the inputs. + Returns: + shape = [batch_size, seq_len, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + # Reshape the query, key, and value tensors. 
+ query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + # Reshape the keys and values and store them in the cache. + # If key_cache and value_cache are not provided, the new key and value + # vectors will not be cached. This happens during the initial memory + # profiling run. + if key_cache is not None and value_cache is not None: + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) + + if input_metadata.is_prompt: + # Prompt run. + if (key_cache is None or value_cache is None + or input_metadata.block_tables.numel() == 0): + # normal attention + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) + else: + # prefix-enabled attention + output = PagedAttentionImpl.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + input_metadata, + self.num_heads, + self.num_kv_heads, + self.alibi_slopes, + ) + else: + # Decoding run. + output = PagedAttentionImpl.forward_decode( + query, + key_cache, + value_cache, + input_metadata, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention/backends/xformers.py similarity index 56% rename from vllm/model_executor/layers/attention.py rename to vllm/model_executor/layers/attention/backends/xformers.py index 2a82325b80213..bad2a648b6703 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -1,37 +1,19 @@ -"""Multi-head attention.""" +"""Attention layer with xFormers and PagedAttention.""" +import importlib from typing import List, Optional -import importlib import torch -import torch.nn as nn from xformers import ops as xops from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) -from vllm._C import ops -from vllm._C import cache_ops from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( - context_attention_fwd) +from vllm.model_executor.layers.attention.ops.paged_attn import ( + PagedAttentionImpl) from vllm.utils import is_hip -_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] -# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. -_PARTITION_SIZE = 512 - - -class PagedAttention(nn.Module): - """MHA/MQA/GQA layer with PagedAttention. - This class takes query, key, and value tensors as input. The input tensors - can either contain prompt tokens or generation tokens. - The class does the following: - - 1. Reshape and store the input key and value tensors in the KV cache. - 2. Perform (multi-head/multi-query/grouped-query) attention using either - xformers or the PagedAttention custom op. - 3. Return the output tensor. 
- """ +class XFormersBackend: def __init__( self, @@ -42,7 +24,6 @@ def __init__( alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, ) -> None: - super().__init__() self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -50,48 +31,17 @@ def __init__( self.sliding_window = sliding_window if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.alibi_slopes = alibi_slopes assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + suppored_head_sizes = PagedAttentionImpl.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") - if self.head_size not in _SUPPORTED_HEAD_SIZES: - raise ValueError(f"head_size ({self.head_size}) is not supported. " - f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.") - - self.use_ref_attention = self.check_use_ref_attention() - - def check_use_ref_attention(self) -> bool: - if not is_hip(): - return False - # For ROCm, check whether flash attention is installed or not. - # if not, use_ref_attention needs to be True - return importlib.util.find_spec("flash_attn") is None - - def ref_masked_attention( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - ) -> torch.Tensor: - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - seq_len, _, _ = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - - attn_weights = self.scale * torch.einsum("qhd,khd->hqk", query, - key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out + self.use_ref_attention = _check_use_ref_attention() def forward( self, @@ -102,7 +52,7 @@ def forward( value_cache: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: - """PagedAttention forward pass. + """Forward pass with xFormers and PagedAttention. Args: query: shape = [batch_size, seq_len, num_heads * head_size] @@ -127,19 +77,14 @@ def forward( # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - ) + PagedAttentionImpl.reshape_and_cache(key, value, key_cache, + value_cache, input_metadata) if input_metadata.is_prompt: - # normal attention + # Prompt run. if (key_cache is None or value_cache is None or input_metadata.block_tables.numel() == 0): + # normal attention if self.num_kv_heads != self.num_heads: # As of Nov 2023, xformers only supports MHA. 
For MQA/GQA, # project the key and value tensors to the desired number of @@ -175,13 +120,19 @@ def forward( seq_len, query.dtype) if self.use_ref_attention: - output = self.ref_masked_attention( + output = _ref_masked_attention( query, key, value, + self.num_heads, + self.num_kv_heads, + self.head_size, + self.scale, ) - # Using view got RuntimeError: view size is not compatible with input tensor's size and stride - # (at least one dimension spans across two contiguous subspaces). Use reshape instead + # Using view got RuntimeError: view size is not compatible + # with input tensor's size and stride (at least one + # dimension spans across two contiguous subspaces). + # Use reshape instead. return output.reshape(batch_size, seq_len, hidden_size) # TODO(woosuk): Too many view operations. Let's try to reduce @@ -206,27 +157,21 @@ def forward( (is_hip()) else None, ) output = out.view_as(query) + else: # prefix-enabled attention - output = torch.empty_like(query) - context_attention_fwd( + output = PagedAttentionImpl.forward_prefix( query, key, value, - output, key_cache, value_cache, - input_metadata.block_tables, # [BS, max_block_per_request] - input_metadata.start_loc, - input_metadata.prompt_lens, - input_metadata.context_lens, - input_metadata.max_seq_len, - getattr(self, "alibi_slopes", None), + input_metadata, + self.alibi_slopes, ) - else: # Decoding run. - output = _paged_attention( + output = PagedAttentionImpl.forward_decode( query, key_cache, value_cache, @@ -274,76 +219,37 @@ def _make_alibi_bias( return attn_bias -def _paged_attention( +def _check_use_ref_attention() -> bool: + if not is_hip(): + return False + # For ROCm, check whether flash attention is installed or not. + # if not, use_ref_attention needs to be True + return importlib.util.find_spec("flash_attn") is None + + +def _ref_masked_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - input_metadata: InputMetadata, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, num_kv_heads: int, + head_size: int, scale: float, - alibi_slopes: Optional[torch.Tensor], ) -> torch.Tensor: - output = torch.empty_like(query) - - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ( - (input_metadata.max_context_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory shortage. - use_v1 = input_metadata.max_context_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: - # Run PagedAttention V1. - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - else: - # Run PagedAttention V2. 
- assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - ) - return output + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/model_executor/layers/attention/ops/__init__.py b/vllm/model_executor/layers/attention/ops/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py new file mode 100644 index 0000000000000..c5a9618c2395b --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -0,0 +1,138 @@ +from typing import List, Optional + +import torch + +from vllm._C import cache_ops +from vllm._C import ops +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.attention.ops.prefix_prefill import ( + context_attention_fwd) + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. +_PARTITION_SIZE = 512 + + +class PagedAttentionImpl: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + + block_size = value_cache.shape[3] + num_seqs, num_heads, head_size = query.shape + max_num_partitions = ( + (input_metadata.max_context_len + _PARTITION_SIZE - 1) // + _PARTITION_SIZE) + # NOTE(woosuk): We use a simple heuristic to decide whether to use + # PagedAttention V1 or V2. If the number of partitions is 1, we use + # V1 to avoid the overhead of reduction. Also, if the number of + # sequences or heads is large, we use V1 since there is enough work + # to parallelize. + # TODO(woosuk): Tune this heuristic. + # For context len > 8192, use V2 kernel to avoid shared memory shortage. 
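# The V1/V2 choice documented in the comments above is pure arithmetic, so a
# worked example may help. The helper _uses_v1 below only restates the
# condition from the patch and is not part of it.
_PARTITION_SIZE = 512  # same value as PARTITION_SIZE in paged_attention_v2_launcher

def _uses_v1(max_context_len: int, num_seqs: int, num_heads: int) -> bool:
    # Number of 512-token partitions needed for the longest context.
    max_num_partitions = (max_context_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE
    return max_context_len <= 8192 and (max_num_partitions == 1
                                        or num_seqs * num_heads > 512)

# 2048-token contexts with 8 sequences x 32 heads: 4 partitions but only 256
# (sequence, head) pairs, so the partitioned V2 kernel is used.
assert _uses_v1(2048, 8, 32) is False
# The same contexts with 32 sequences give 1024 pairs, which is already enough
# parallel work, so V1 is used.
assert _uses_v1(2048, 32, 32) is True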
+ use_v1 = input_metadata.max_context_len <= 8192 and ( + max_num_partitions == 1 or num_seqs * num_heads > 512) + if use_v1: + # Run PagedAttention V1. + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + else: + # Run PagedAttention V2. + assert _PARTITION_SIZE % block_size == 0 + tmp_output = torch.empty( + size=(num_seqs, num_heads, max_num_partitions, head_size), + dtype=output.dtype, + device=output.device, + ) + exp_sums = torch.empty( + size=(num_seqs, num_heads, max_num_partitions), + dtype=torch.float32, + device=output.device, + ) + max_logits = torch.empty_like(exp_sums) + ops.paged_attention_v2( + output, + exp_sums, + max_logits, + tmp_output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + input_metadata.block_tables, + input_metadata.context_lens, + block_size, + input_metadata.max_context_len, + alibi_slopes, + input_metadata.kv_cache_dtype, + ) + return output + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + input_metadata: InputMetadata, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + output = torch.empty_like(query) + context_attention_fwd( + query, + key, + value, + output, + key_cache, + value_cache, + input_metadata.block_tables, # [BS, max_block_per_request] + input_metadata.start_loc, + input_metadata.prompt_lens, + input_metadata.context_lens, + input_metadata.max_seq_len, + alibi_slopes, + ) + return output diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/attention/ops/prefix_prefill.py similarity index 100% rename from vllm/model_executor/layers/triton_kernel/prefix_prefill.py rename to vllm/model_executor/layers/attention/ops/prefix_prefill.py diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 550dec6487f9e..6da0082b94285 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -27,7 +27,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -151,10 +151,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) else: self.rotary_emb = get_rope( self.head_dim, @@ -163,8 +163,7 @@ def __init__( base=self.rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, self.head_dim, - self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..0548b2b140b1b 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import 
InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -107,10 +107,10 @@ def __init__( alibi_slopes = alibi_slopes[head_start:head_end].tolist() scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes) def forward( self, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..1c5dcfacaff2b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -10,7 +10,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -87,7 +87,7 @@ def __init__( base=10000 * rope_ratio, is_neox_style=False, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 6dba952736921..f2dca3df27cfb 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -29,7 +29,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -229,10 +229,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..3c148be5b10f4 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -28,7 +28,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -150,10 +150,10 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -161,16 +161,16 @@ def __init__( alibi_slopes = 
(_get_alibi_slopes(self.total_num_heads) * self.inv_norm_factor) alibi_slopes = alibi_slopes[head_start:head_end].tolist() - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.inv_norm_factor, - num_kv_heads=self.num_kv_heads, - alibi_slopes=alibi_slopes) + self.attn = Attention(self.num_heads, + self.head_dim, + self.inv_norm_factor, + num_kv_heads=self.num_kv_heads, + alibi_slopes=alibi_slopes) else: - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.inv_norm_factor, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.inv_norm_factor, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index bf1f164ff700d..386a36cf492d6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -23,7 +23,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import GeluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -123,10 +123,10 @@ def __init__(self, base=self.rope_theta, is_neox_style=True, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..3f7b21e5a4133 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -73,9 +73,7 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale) + self.attn = Attention(self.num_heads, self.head_dim, scale=self.scale) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..5c30d47d93e36 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -26,7 +26,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -85,10 +85,10 @@ def __init__( bias=True, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scale, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scale, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..b8c6822e9825e 
100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -86,7 +86,7 @@ def __init__( base=rope_theta, is_neox_style=False, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..98107350e60b9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -24,7 +24,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -87,7 +87,7 @@ def __init__( max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..0ae0a85643456 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -7,7 +7,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -114,10 +114,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d35887cc0f6a3..4c163dfdab537 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,7 +30,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -139,11 +139,11 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=sliding_window) def forward( self, diff --git 
a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d78..d47834e519697 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -29,7 +29,7 @@ from vllm.config import LoRAConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, @@ -197,7 +197,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..25c7f1978c0dc 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -32,7 +32,7 @@ from transformers import MixtralConfig from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, ReplicatedLinear, @@ -214,7 +214,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..16ecac3d0529a 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -105,11 +105,11 @@ def __init__( self.head_dim = self.d_model // self.total_num_heads scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scaling, - alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + scaling, + alibi_slopes=alibi_slopes, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 9d563039208c8..fa7a6d850051e 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -43,7 +43,7 @@ from torch import nn from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import ( ColumnParallelLinear, LinearMethodBase, @@ -126,9 +126,9 @@ def __init__( base=rope_theta, ) self.scaling = self.head_dim**-0.5 - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) # Attention output projection. 
self.attn_out = RowParallelLinear( diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..782f43ce265bd 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -89,9 +89,9 @@ def __init__( bias=bias, linear_method=linear_method, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - scale=self.scaling) + self.attn = Attention(self.num_heads, + self.head_dim, + scale=self.scaling) def forward( self, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 0b067d4fc8802..6039b1cdc3534 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -118,10 +118,10 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads) def forward( self, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..039dc7a9b7675 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -43,7 +43,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearMethodBase, QKVParallelLinear, @@ -108,7 +108,7 @@ def __init__(self, max_position=max_position_embeddings, base=rope_theta, ) - self.attn = PagedAttention(self.num_heads, self.head_size, scaling) + self.attn = Attention(self.num_heads, self.head_size, scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 37af84c7cd53f..d4d5a4e8bb9a5 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -104,7 +104,7 @@ def __init__( base=rope_theta, rope_scaling=rope_scaling, ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling) def forward( self, diff --git a/vllm/model_executor/models/qwen2.py 
b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..3586a7fb82778 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -30,7 +30,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, @@ -135,11 +135,11 @@ def __init__(self, max_position=max_position, base=self.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - sliding_window=self.sliding_window) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + sliding_window=self.sliding_window) def forward( self, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 44c57e5a6d4f9..d1a547f815616 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.linear import (LinearMethodBase, MergedColumnParallelLinear, QKVParallelLinear, @@ -122,10 +122,10 @@ def __init__(self, max_position=self.config.max_position_embeddings, base=self.config.rope_theta, ) - self.attn = PagedAttention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_key_value_heads) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_key_value_heads) def forward( self, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1eda07b724cae..efa235233372f 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -25,7 +25,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -103,7 +103,7 @@ def __init__(self, base=int(self.rope_theta), is_neox_style=True, ) - self.attn = PagedAttention( + self.attn = Attention( self.num_heads, self.head_dim, self.scaling, From 385da2dae2b90e5273da8dfce881727bd9c574a1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 7 Mar 2024 11:42:42 -0800 Subject: [PATCH 065/113] Measure model memory usage (#3120) --- vllm/utils.py | 25 +++++++++++++++++++++++++ vllm/worker/model_runner.py | 18 ++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 9cdf623379516..5b94067cec777 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -3,6 +3,7 @@ import socket import subprocess import uuid +import gc from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -309,3 +310,27 @@ def create_kv_caches_with_random( f"Does 
not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches + + +class measure_cuda_memory: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index aff8ebc903623..b01f865f1bb03 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl +from vllm.utils import in_wsl, measure_cuda_memory logger = init_logger(__name__) @@ -85,11 +85,17 @@ def __init__( self.model_config.enforce_eager = True def load_model(self) -> None: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + with measure_cuda_memory() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + self.model_memory_usage = m.consumed_memory + logger.info( + f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" + ) vocab_size = self.model.config.vocab_size From 8cbba4622c8c526b207b17e3ba51e18e2c766419 Mon Sep 17 00:00:00 2001 From: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:03:22 -0500 Subject: [PATCH 066/113] Possible fix for conflict between Automated Prefix Caching (#2762) and multi-LoRA support (#1804) (#3263) --- tests/test_cache_block_hashing.py | 46 +++++++++++++++++++++---------- vllm/sequence.py | 3 +- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index c2067e52b59c0..fb541f38f3489 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -2,8 +2,11 @@ Run `pytest tests/test_cache_block_hashing.py`. 
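# The measure_cuda_memory context manager added above is used in
# model_runner.py to report how much GPU memory the weights take. A minimal
# usage sketch (a plain tensor allocation stands in for model loading, and a
# CUDA device is assumed):
import torch
from vllm.utils import measure_cuda_memory

with measure_cuda_memory() as m:
    weights = torch.empty(1024, 1024, dtype=torch.float16, device="cuda")

print(f"Allocation took {m.consumed_memory / float(2**30):.4f} GB")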
""" +from typing import List, Optional + import pytest +from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import TokenizerGroup from vllm.sequence import Sequence @@ -36,7 +39,10 @@ def flatten_2d(li): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): +@pytest.mark.parametrize("concurrent_lora_int_ids", + [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, + concurrent_lora_int_ids: List[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -48,20 +54,30 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): hashes = [] for prefix in prefixes: - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - tokenizer.tokenizer.eos_token_id) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - seq_id += 1 + for lora_int_id in concurrent_lora_int_ids: + lora_request = None + + if lora_int_id is not None: + lora_request = LoRARequest( + f"example_lora_{lora_int_id}", + lora_int_id, + f"example/path/to/lora_{lora_int_id}", + ) + + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id, lora_request) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 # Check that hashes made with two prefixes with different first blocks are # different everywhere. diff --git a/vllm/sequence.py b/vllm/sequence.py index 19dafe3cb0fc9..fee96a875dde5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -175,7 +175,8 @@ def hash_of_block(self, logical_idx: int) -> int: # TODO: The current hashing function is O(L^2). We should optimize # this in the future. 
num_tokens = self.num_hashed_tokens_of_block(logical_idx) - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + return hash( + (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id)) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size From b35cc93420e37b72dc1c4bbedb06012fd294b743 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 8 Mar 2024 01:37:28 +0100 Subject: [PATCH 067/113] Fix auto prefix bug (#3239) --- tests/engine/test_computed_prefix_blocks.py | 34 +++++++++++++++++++++ vllm/core/block_manager.py | 28 +++++++++-------- vllm/worker/model_runner.py | 1 + 3 files changed, 51 insertions(+), 12 deletions(-) create mode 100644 tests/engine/test_computed_prefix_blocks.py diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py new file mode 100644 index 0000000000000..ed35212cc3f11 --- /dev/null +++ b/tests/engine/test_computed_prefix_blocks.py @@ -0,0 +1,34 @@ +import pytest + +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("block_size", [16]) +def test_computed_prefix_blocks(model: str, block_size: int): + # This test checks if we are able to run the engine to completion + # without triggering asserts. + # We are in a scenario where all blocks from the second request's prompt + # are full and already computed when the second request arrives. + prompt = ( + "You are a helpful assistant. How do I build a car from cardboard and " + "paper clips? Is there an easy to follow video tutorial available " + "online for free?") + prompt2 = ( + " Please recommend to me some resources where I can learn not only to " + "handle technical difficulties of building a car, but also " + "decoration.") + + engine_args = EngineArgs(model=model, + block_size=block_size, + enable_prefix_caching=True) + + engine = LLMEngine.from_engine_args(engine_args) + sampling_params = SamplingParams() + + engine.add_request("0", prompt + prompt2, sampling_params) + engine.step() + engine.add_request("1", prompt, sampling_params) + engine.step() diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index daf83827a7e52..52b120f227eda 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,6 +1,6 @@ """A block manager that manages token blocks.""" import enum -from itertools import count +from itertools import count, takewhile from os.path import commonprefix from typing import Dict, List, Optional, Set, Tuple @@ -426,23 +426,29 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def compute_last_full_block_in_seq(self, seq: Sequence): + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return max_full_block = seq.get_len() // self.block_size - 1 block_table = self.block_tables[seq.seq_id] if max_full_block == -1: return - block_table[max_full_block].computed = True + for i in reversed(range(max_full_block)): + if block_table[i].computed: + break + block_table[i].computed = True - def get_all_block_ids_till_computed(self, seq: Sequence) -> List[int]: + def get_all_computed_blocks(self, seq: Sequence) -> List[int]: if seq.seq_id not in self.block_tables: return [] block_table = self.block_tables[seq.seq_id] - for block_idx in reversed(range(len(block_table))): - if block_table[block_idx].computed: 
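# The hash_of_block change above folds the LoRA ID into the prefix-cache key,
# so identical prompt tokens served under different adapters no longer share
# cached blocks. A small illustration (assuming lora_int_id is 0 for a
# sequence without a LoRA request):
block_tokens = (1, 2, 3, 4)              # token IDs covered by one block
no_lora_hash = hash((block_tokens, 0))   # base model, no LoRA request
lora_1_hash = hash((block_tokens, 1))    # LoRA adapter with int ID 1
# Different hashes mean the two sequences get separate cache entries.
assert no_lora_hash != lora_1_hash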
- return [b.block_number for b in block_table[:block_idx + 1]] - return [] + # NOTE We exclude the last block to avoid the case where the entire + # prompt is cached. This would cause erroneous behavior in model + # runner. + return [ + b.block_number + for b in takewhile(lambda b: b.computed, block_table[:-1]) + ] def get_common_computed_block_ids(self, seq_group: SequenceGroup) -> List[int]: @@ -451,14 +457,12 @@ def get_common_computed_block_ids(self, return [] ids_list = [ - self.get_all_block_ids_till_computed(seq) + self.get_all_computed_blocks(seq) for seq in iter(seq_group.seqs_dict.values()) ] return commonprefix([ids for ids in ids_list if ids != []]) def mark_blocks_as_computed(self, seq_group: SequenceGroup): - # NOTE: We only mark the last full block because with prefix caching, - # all blocks until the marked one are guaranteed to be computed. if self.enable_caching: for seq in seq_group.seqs_dict.values(): - self.compute_last_full_block_in_seq(seq) + self.compute_full_blocks_in_seq(seq) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b01f865f1bb03..9023b0c59b3fb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -215,6 +215,7 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_prompt_len = max(subquery_lens) + assert max_prompt_len > 0 input_tokens = _make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, From d2339d6840498397f6e373489ed120cd2cce8eb4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 7 Mar 2024 16:38:12 -0800 Subject: [PATCH 068/113] Connect engine healthcheck to openai server (#3260) --- vllm/entrypoints/openai/api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 993a834e5a720..9f29b4ac92f48 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -160,6 +160,7 @@ async def validation_exception_handler(_, exc): @app.get("/health") async def health() -> Response: """Health check.""" + await openai_serving_chat.engine.check_health() return Response(status_code=200) From c59e120c557743b0fc8178ee1796c8a3def78bf4 Mon Sep 17 00:00:00 2001 From: whyiug Date: Fri, 8 Mar 2024 13:58:24 +0800 Subject: [PATCH 069/113] Feature add lora support for Qwen2 (#3177) --- csrc/punica/bgmv/bgmv_config.h | 2 ++ vllm/model_executor/models/qwen2.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index d5fee9c40d00c..3eb84ceb4d534 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -21,6 +21,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 2048) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ + f(in_T, out_T, W_T, narrow, 2816) \ f(in_T, out_T, W_T, narrow, 3072) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ @@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ f(in_T, out_T, W_T, narrow, 12288) \ + f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 3586a7fb82778..4dd63f923e5f2 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py 
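# Patch 068 above routes the OpenAI server's /health endpoint through the
# engine health check, so a 200 response now implies the engine itself is
# responsive. A minimal probe, assuming the server runs at its default
# local address (the URL below is illustrative, not part of the patch):
import requests

response = requests.get("http://localhost:8000/health")
assert response.status_code == 200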
@@ -46,6 +46,7 @@ from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -264,12 +265,35 @@ def forward( class Qwen2ForCausalLM(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__( self, config: Qwen2Config, linear_method: Optional[LinearMethodBase] = None, + lora_config: Optional[LoRAConfig] = None, ) -> None: + del lora_config super().__init__() self.config = config self.linear_method = linear_method From 1ece1ae829dcbc4b1b19b3e2d3042457615e862f Mon Sep 17 00:00:00 2001 From: TianYu GUO Date: Fri, 8 Mar 2024 14:22:59 +0800 Subject: [PATCH 070/113] [Minor Fix] Fix comments in benchmark_serving (#3252) --- benchmarks/benchmark_serving.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 7d389a9c7d703..3f5e2d9c8f4dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -12,7 +12,7 @@ On the client side, run: python benchmarks/benchmark_serving.py \ --backend \ - --tokenizer --dataset \ + --model --dataset \ --request-rate """ import argparse @@ -171,10 +171,10 @@ async def benchmark( else: raise ValueError(f"Unknown backend: {backend}") - pbar = None if disable_tqdm else tqdm(total=len(input_requests)) - print(f"Traffic request rate: {request_rate}") + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + benchmark_start_time = time.perf_counter() tasks = [] async for request in get_request(input_requests, request_rate): From 99c3cfb83c20d45899ab8cbfdddce98c7cffb7b1 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 8 Mar 2024 09:58:01 -0800 Subject: [PATCH 071/113] [Docs] Fix Unmocked Imports (#3275) --- docs/source/conf.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 5a45c6f9d1e0a..61d24e1612128 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -72,8 +72,15 @@ # Mock out external dependencies here. autodoc_mock_imports = [ - "torch", "transformers", "psutil", "prometheus_client", "sentencepiece", - "vllm.cuda_utils", "vllm._C" + "torch", + "transformers", + "psutil", + "prometheus_client", + "sentencepiece", + "vllm.cuda_utils", + "vllm._C", + "numpy", + "tqdm", ] for mock_target in autodoc_mock_imports: From 1cb0cc2975d1c42c445c795f955b783e78919502 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 8 Mar 2024 10:52:20 -0800 Subject: [PATCH 072/113] [FIX] Make `flash_attn` optional (#3269) --- .gitignore | 3 -- setup.py | 48 ++----------------- vllm/__init__.py | 30 +++--------- .../layers/attention/attention.py | 37 +++++++++++--- .../layers/attention/backends/flash_attn.py | 1 - 5 files changed, 41 insertions(+), 78 deletions(-) diff --git a/.gitignore b/.gitignore index 0b14c98270c41..b5195629e5cf3 100644 --- a/.gitignore +++ b/.gitignore @@ -184,6 +184,3 @@ _build/ # Benchmark dataset *.json - -# Third-party Python packages. 
-vllm/thirdparty_files/ diff --git a/setup.py b/setup.py index 57d7a139e8237..745b5a9b2d02a 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ import os import re import subprocess -import sys import warnings from pathlib import Path from typing import List, Set @@ -15,8 +14,6 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) -# This is a temporary directory to store third-party packages. -THIRDPARTY_SUBDIR = "vllm/thirdparty_files" # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. @@ -327,46 +324,8 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) - - # Download the FlashAttention package. - # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/setup.py#L518-L530 - flash_attn_version = "2.5.6" - install_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) - subprocess.check_call( - [ - sys.executable, - "-m", - "pip", - "install", - "-q", - f"--target={install_dir}", - "einops", # Dependency of flash-attn. - f"flash-attn=={flash_attn_version}", - "--no-dependencies", # Required to avoid re-installing torch. - ], - env=dict(os.environ, CC="gcc"), - ) - - # Copy the FlashAttention package into the vLLM package after build. - class build_ext(BuildExtension): - - def run(self): - super().run() - target_dir = os.path.join(self.build_lib, THIRDPARTY_SUBDIR) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - self.copy_tree(install_dir, target_dir) - - class BinaryDistribution(setuptools.Distribution): - - def has_ext_modules(self): - return True - -else: - build_ext = BuildExtension - BinaryDistribution = setuptools.Distribution - if _is_neuron(): - neuronxcc_version = get_neuronxcc_version() +elif _is_neuron(): + neuronxcc_version = get_neuronxcc_version() vllm_extension_sources = [ "csrc/cache_kernels.cu", @@ -509,7 +468,6 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": build_ext} if not _is_neuron() else {}, - distclass=BinaryDistribution, + cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, package_data=package_data, ) diff --git a/vllm/__init__.py b/vllm/__init__.py index 59f1345b58d42..f1e30f5eb6e6e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,28 +1,12 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" - -# Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 -def _configure_system(): - import os - import sys - - # Importing flash-attn. - thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), - "thirdparty_files") - sys.path.insert(0, thirdparty_files) - - -_configure_system() -# Delete configuration function. 
-del _configure_system - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 -from vllm.engine.llm_engine import LLMEngine # noqa: E402 -from vllm.engine.ray_utils import initialize_cluster # noqa: E402 -from vllm.entrypoints.llm import LLM # noqa: E402 -from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 -from vllm.sampling_params import SamplingParams # noqa: E402 +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.engine.ray_utils import initialize_cluster +from vllm.entrypoints.llm import LLM +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import SamplingParams __version__ = "0.3.3" diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 830e82e10f7ad..724dd0511c5aa 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,12 +1,16 @@ """Attention layer.""" +from functools import lru_cache from typing import List, Optional import torch import torch.nn as nn +from vllm.logger import init_logger from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip +logger = init_logger(__name__) + class Attention(nn.Module): """Attention layer. @@ -30,17 +34,12 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and - torch.get_default_dtype() in (torch.float16, torch.bfloat16)): - # Ampere or later NVIDIA GPUs. - # NOTE(woosuk): FlashAttention does not support FP32. + if _use_flash_attn(): from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - # Turing and Volta NVIDIA GPUs or AMD GPUs. - # Or FP32 on any GPU. from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, @@ -57,3 +56,29 @@ def forward( ) -> torch.Tensor: return self.backend.forward(query, key, value, key_cache, value_cache, input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if is_hip(): + # AMD GPUs. + return False + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. 
" + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 512f4e49c7eb2..4abe195f274a7 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -1,7 +1,6 @@ """Attention layer with Flash and PagedAttention.""" from typing import List, Optional -# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. from flash_attn import flash_attn_func import torch From c2c5e0909ad4457ad542117939c2629ebe2db609 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 8 Mar 2024 13:33:10 -0800 Subject: [PATCH 073/113] Move model filelocks from `/tmp/` to `~/.cache/vllm/locks/` dir (#3241) --- vllm/model_executor/weight_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..24d78db8d2637 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -20,6 +20,9 @@ logger = init_logger(__name__) +_xdg_cache_home = os.getenv('XDG_CACHE_HOME', os.path.expanduser('~/.cache')) +_vllm_filelocks_path = os.path.join(_xdg_cache_home, 'vllm/locks/') + class Disabledtqdm(tqdm): @@ -28,7 +31,8 @@ def __init__(self, *args, **kwargs): def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): - lock_dir = cache_dir if cache_dir is not None else "/tmp" + lock_dir = cache_dir if cache_dir is not None else _vllm_filelocks_path + os.makedirs(os.path.dirname(lock_dir), exist_ok=True) lock_file_name = model_name_or_path.replace("/", "-") + ".lock" lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name)) return lock From f48c6791b7bfc2579ad575d33ed83912f0bfb011 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Fri, 8 Mar 2024 17:16:14 -0800 Subject: [PATCH 074/113] [FIX] Fix prefix test error on main (#3286) --- vllm/model_executor/layers/attention/backends/flash_attn.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 4abe195f274a7..58ccd461b993e 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -103,8 +103,6 @@ def forward( key_cache, value_cache, input_metadata, - self.num_heads, - self.num_kv_heads, self.alibi_slopes, ) else: From 8437bae6ef47a690d18c72f0da02c7e5abe83866 Mon Sep 17 00:00:00 2001 From: Cade Daniel Date: Fri, 8 Mar 2024 23:32:46 -0800 Subject: [PATCH 075/113] [Speculative decoding 3/9] Worker which speculates, scores, and applies rejection sampling (#3103) --- .buildkite/test-pipeline.yaml | 5 +- tests/{worker => }/spec_decode/__init__.py | 0 tests/spec_decode/test_batch_expansion.py | 95 +++ tests/spec_decode/test_metrics.py | 157 +++++ .../spec_decode/test_multi_step_worker.py | 162 ++++- tests/spec_decode/test_spec_decode_worker.py | 591 ++++++++++++++++++ tests/spec_decode/test_utils.py | 111 ++++ tests/{worker => }/spec_decode/utils.py | 115 +++- tests/test_sequence.py | 50 ++ .../layers/rejection_sampler.py | 10 +- vllm/model_executor/layers/sampler.py | 2 +- vllm/sequence.py | 55 +- vllm/spec_decode/batch_expansion.py | 351 +++++++++++ vllm/spec_decode/interfaces.py | 77 +++ vllm/spec_decode/metrics.py | 174 ++++++ 
vllm/spec_decode/multi_step_worker.py | 366 +++++++++++ vllm/spec_decode/spec_decode_worker.py | 372 +++++++++++ vllm/spec_decode/util.py | 99 +++ vllm/worker/model_runner.py | 11 +- vllm/worker/spec_decode/multi_step_worker.py | 178 ------ vllm/worker/worker.py | 20 +- 21 files changed, 2786 insertions(+), 215 deletions(-) rename tests/{worker => }/spec_decode/__init__.py (100%) create mode 100644 tests/spec_decode/test_batch_expansion.py create mode 100644 tests/spec_decode/test_metrics.py rename tests/{worker => }/spec_decode/test_multi_step_worker.py (61%) create mode 100644 tests/spec_decode/test_spec_decode_worker.py create mode 100644 tests/spec_decode/test_utils.py rename tests/{worker => }/spec_decode/utils.py (60%) create mode 100644 tests/test_sequence.py create mode 100644 vllm/spec_decode/batch_expansion.py create mode 100644 vllm/spec_decode/interfaces.py create mode 100644 vllm/spec_decode/metrics.py create mode 100644 vllm/spec_decode/multi_step_worker.py create mode 100644 vllm/spec_decode/spec_decode_worker.py create mode 100644 vllm/spec_decode/util.py delete mode 100644 vllm/worker/spec_decode/multi_step_worker.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 15f971b66e3bd..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,7 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Engine Test - command: pytest -v -s engine + command: pytest -v -s engine test_sequence.py - label: Entrypoints Test command: pytest -v -s entrypoints @@ -52,6 +52,9 @@ steps: - label: Worker Test command: pytest -v -s worker +- label: Speculative decoding tests + command: pytest -v -s spec_decode + - label: LoRA Test command: pytest -v -s lora --forked diff --git a/tests/worker/spec_decode/__init__.py b/tests/spec_decode/__init__.py similarity index 100% rename from tests/worker/spec_decode/__init__.py rename to tests/spec_decode/__init__.py diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py new file mode 100644 index 0000000000000..fddc3995452cc --- /dev/null +++ b/tests/spec_decode/test_batch_expansion.py @@ -0,0 +1,95 @@ +import torch +import pytest + +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer + +from .utils import mock_worker, create_seq_group_metadata_from_prompts + + +@pytest.mark.parametrize('num_target_seq_ids', [100]) +def test_create_target_seq_id_iterator(num_target_seq_ids: int): + """Verify all new sequence ids are greater than all input + seq ids. + """ + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + + all_seq_ids = [ + [1, 3, 5, 7], + list(range(100)) + [0], + [100], + ] + + for seq_ids in all_seq_ids: + max_seq_id = max(seq_ids) + iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access + for _ in range(num_target_seq_ids): + assert next(iterator) > max_seq_id + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_get_token_ids_to_score(k: int): + """Verify correct tokens are selected for scoring. 
+ """ + proposal_token_ids = torch.tensor( + list(range(k)), + dtype=torch.int64, + device='cuda', + ) + + expected_output = [ + [], + ] + for i in range(proposal_token_ids.shape[0]): + expected_output.append(proposal_token_ids[:i + 1].tolist()) + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + + actual_output = [ + x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output + ] + + assert actual_output == expected_output + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_create_single_target_seq_group_metadata(k: int): + """Verify correct creation of a batch-expanded seq group metadata. + """ + + prompt_tokens = [1, 2, 3] + prev_output_tokens = [4, 5, 6] + + token_ids = list(range(k)) + + num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 + + final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( + token_ids) + + block_size = 32 + input_seq_group_metadata = create_seq_group_metadata_from_prompts( + [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], + [prev_output_tokens], [num_tokens_processed])[0] + + input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] + target_seq_id = 100 + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access + input_seq_group_metadata, + input_seq_id, + target_seq_id, + token_ids, + ) + + assert output.request_id == input_seq_group_metadata.request_id + assert len(output.seq_data) == 1 + assert output.seq_data[target_seq_id].get_prompt_token_ids( + ) == prompt_tokens + assert output.seq_data[target_seq_id].get_output_token_ids( + ) == prev_output_tokens + token_ids + + assert len(output.block_tables) == 1 + assert output.block_tables[ + target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py new file mode 100644 index 0000000000000..941ea37aa81e0 --- /dev/null +++ b/tests/spec_decode/test_metrics.py @@ -0,0 +1,157 @@ +import torch +import math +import pytest + +from unittest.mock import MagicMock + +from vllm.spec_decode.metrics import AsyncMetricsCollector + + +def test_initial_call_returns_none(): + """Expect first call to get metrics to return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=0) + maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert maybe_metrics is None + + +def test_second_call_returns_metrics(): + """Expect second call to not return None. 
+ """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("rank", [1, 2, 3, 4]) +def test_nonzero_rank_noop(rank): + """Verify nonzero ranks don't collect metrics. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=rank) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + +def test_noop_until_time(): + """Verify metrics aren't collected until enough time passes. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, + collect_interval_s + 0.1, collect_interval_s + 0.1 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("has_data", [True, False]) +def test_initial_metrics_has_correct_values(has_data: bool): + """Test correctness of metrics data. 
+ """ + if has_data: + num_accepted_tokens = 103 + num_emitted_tokens = 104 + num_draft_tokens = 105 + else: + num_accepted_tokens = 0 + num_emitted_tokens = 0 + num_draft_tokens = 0 + k = 5 + + num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + num_draft_tokens, k) + + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = num_draft_tokens + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k) + metrics = collector.maybe_collect_rejsample_metrics(k) + + assert metrics.num_spec_tokens == k + assert metrics.accepted_tokens == num_accepted_tokens + assert metrics.draft_tokens == num_draft_tokens + assert metrics.emitted_tokens == num_emitted_tokens + + if has_data: + assert metrics.draft_acceptance_rate == num_accepted_tokens / num_draft_tokens + assert metrics.system_efficiency == num_emitted_tokens / num_possible_tokens + else: + assert math.isnan(metrics.draft_acceptance_rate) + assert math.isnan(metrics.system_efficiency) diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py similarity index 61% rename from tests/worker/spec_decode/test_multi_step_worker.py rename to tests/spec_decode/test_multi_step_worker.py index ea54802903578..88bb7c293fe95 100644 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,14 +3,15 @@ import pytest from unittest.mock import MagicMock -from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.multi_step_worker import MultiStepWorker, DraftModelTop1Proposer from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from .utils import (create_execute_model_data, create_worker, create_seq_group_metadata_from_prompts, zero_kv_cache, patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) + assert_logprobs_dict_allclose, create_batch) @pytest.mark.parametrize('num_steps', list(range(1, 17))) @@ -259,3 +260,160 @@ def test_same_output_for_multi_step(): multi_step_output_logprobs, single_step_output_logprobs): assert_logprobs_dict_allclose(multi_step_logprobs, single_step_logprobs) + + +@torch.inference_mode() +def test_draft_proposals_full_speculation_len(): + """Verify DraftModelTop1Proposer correctly handles case where all sequences + can speculate. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=2048, + vocab_size=vocab_size, + ) + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(batch_size, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch(batch_size, k) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_no_speculations(): + """Verify DraftModelTop1Proposer correctly handles case where no sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + prompt_len = 10 + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=prompt_len + k - 1, + vocab_size=vocab_size, + ) + + execute_model_data, _, _ = create_batch(batch_size, + k, + prompt_len=prompt_len) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([0, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_mixed_k(): + """Verify DraftModelTop1Proposer correctly handles case some sequences can + speculate and some can't. 
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + small_prompt_len = 5 + long_prompt_len = 10 + prev_output_token_len = 20 + + expected_num_proposal_seqs = 6 + expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs + + prompt_len = [ + small_prompt_len for _ in range(expected_num_proposal_seqs - 1) + ] + [long_prompt_len + for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=long_prompt_len + prev_output_token_len + k - 1, + vocab_size=vocab_size, + ) + + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(expected_num_proposal_seqs, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint( + low=0, + high=vocab_size, + size=(expected_num_proposal_seqs, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch( + batch_size, + k, + prompt_len=prompt_len, + prev_output_token_len=prev_output_token_len, + ) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [ + k for _ in range(expected_num_proposal_seqs - 1) + ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py new file mode 100644 index 0000000000000..e919711c3ed2c --- /dev/null +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -0,0 +1,591 @@ +import torch +import random +import pytest +from unittest.mock import MagicMock + +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker, split_num_cache_blocks_evenly +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from .utils import mock_worker, create_batch, ExecuteModelData, create_sampler_output_list +from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics, AsyncMetricsCollector + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_draft_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the draft worker with correct + inputs. Everything else is mocked out. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + exception_secret = 'artifical stop' + draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) + + execute_model_data, _, _ = create_batch(batch_size, k) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + call_args_list = draft_worker.get_spec_proposals.call_args_list + assert len(call_args_list) == 1 + + for args, _ in call_args_list: + (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, actual_k) = args + actual_execute_model_data = ExecuteModelData(seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy) + assert actual_execute_model_data == execute_model_data + assert actual_k == k + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_target_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the target model with correct + inputs. Everything else is mocked out. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + vocab_size = 32_000 + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + exception_secret = 'artifical stop' + target_worker.execute_model.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + seen_contexts = [] + + call_args_list = target_worker.execute_model.call_args_list + assert len(call_args_list) == 1 + for args, kwargs in call_args_list: + target_execute_model_data = ExecuteModelData.from_dict(kwargs) + + assert len(target_execute_model_data.seq_group_metadata_list) == ( + k + 1) * batch_size + for seq_group_metadata in ( + target_execute_model_data.seq_group_metadata_list): + for seq_data in seq_group_metadata.seq_data.values(): + seen_contexts.append(seq_data.get_token_ids()) + + expected_seen_contexts = [] + + for prompt, prev_generated, draft_tokens in zip( + prompts, prev_output_tokens, proposal_token_ids.tolist()): + + for i in range(len(draft_tokens) + 1): + expected_seen_contexts.append(prompt + prev_generated + + draft_tokens[:i]) + + seen_contexts.sort() + expected_seen_contexts.sort() + assert expected_seen_contexts == seen_contexts + + 
+@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_rejection_sampler(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the rejection sampler with + correct inputs. Everything else is mocked out. + """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + exception_secret = 'artifical stop' + rejection_sampler.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + assert len(rejection_sampler.call_args_list) == 1 + args, _ = rejection_sampler.call_args_list[0] + (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, + actual_proposal_token_ids) = args + + assert torch.equal(actual_bonus_token_ids, + target_token_ids.reshape(batch_size, k + 1)[:, -1:]) + assert torch.equal( + actual_proposal_scores, + target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual_proposal_token_ids, proposal_token_ids) + assert torch.equal(actual_proposal_probs, proposal_probs) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_formats_output(k: int, batch_size: int): + """Verify SpecDecodeWorker formats sampler output correctly. + Everything else is mocked out. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + expected_output = create_sampler_output_list( + rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) + + seq_ids = [ + next(iter(seq_group_metadata.seq_data.keys())) + for seq_group_metadata in execute_model_data.seq_group_metadata_list + ] + actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} + expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + + for step in output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + actual_output_by_seq[seq_id].append(sample) + + for step in expected_output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + expected_output_by_seq[seq_id].append(sample) + + all_seen_seq_ids = set( + list(actual_output_by_seq.keys()) + + list(expected_output_by_seq.keys())) + for seq_id in all_seen_seq_ids: + actual_by_step = actual_output_by_seq[seq_id] + expected_by_step = expected_output_by_seq[seq_id] + + for i in range(k + 1): + if i >= len(actual_by_step): + assert expected_by_step[i].output_token == -1 + continue + assert actual_by_step[i].output_token == expected_by_step[ + i].output_token + assert actual_by_step[i].logprobs == expected_by_step[i].logprobs + + +@pytest.mark.parametrize('k', [1, 2]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('returns_metrics', [True, False]) +@torch.inference_mode() +def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): + """Verify SpecDecodeWorker collects metrics. 
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + mock_rejsample_metrics = MagicMock( + spec=SpecDecodeWorkerMetrics) if returns_metrics else None + metrics_collector.maybe_collect_rejsample_metrics.return_value = mock_rejsample_metrics + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics + + call_args_list = metrics_collector.maybe_collect_rejsample_metrics.call_args_list + assert len(call_args_list) == 1 + args, kwargs = call_args_list[0] + assert args[0] == k or kwargs.get('k', -1) == k + + +@pytest.mark.parametrize('k', [0]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_k_equals_zero(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when k is zero. This happens during prefill. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@pytest.mark.parametrize('k', [0, 5]) +@pytest.mark.parametrize('batch_size', [0]) +@torch.inference_mode() +def test_empty_input_batch(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when the input batch is empty. This can happen if the engine communicates + to the workers information without scheduling a batch. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@torch.inference_mode() +def test_init_model(): + """Verify SpecDecodeWorker invokes proposer/scorer worker init_model, as + well as other GPU initialization. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + worker.init_model() + + draft_worker.init_model.assert_called_once() + + target_worker.init_model.assert_called_once() + + metrics_collector.init_gpu_tensors.assert_called_once() + rejection_sampler.init_gpu_tensors.assert_called_once() + + +@torch.inference_mode() +def test_init_cache_engine(): + """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + workers. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + cache_config = MagicMock() + + worker.init_cache_engine(cache_config) + + draft_worker.init_cache_engine.assert_called_once_with(cache_config) + target_worker.init_cache_engine.assert_called_once_with(cache_config) + + +@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) +@pytest.mark.parametrize('available_cpu_blocks', [500]) +@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_profile_num_available_blocks(available_gpu_blocks: int, + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify SpecDecodeWorker correctly profiles num available GPU blocks. + Specifically, it should run profiling in the scorer worker, and then evenly + split the blocks between proposer and scorer worker. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + target_worker.profile_num_available_blocks.return_value = ( + available_gpu_blocks, available_cpu_blocks) + target_worker.get_cache_block_size_bytes.return_value = target_cache_block_size_bytes + draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + # These values do not directly impact the adjusted block size calculation, + # so they can be fixed. + gpu_memory_utilization = 0.9 + cpu_swap_space = 100 + block_size = 16 + + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + + target_worker.profile_num_available_blocks.assert_called_once_with( + block_size, gpu_memory_utilization, cpu_swap_space, "auto") + assert num_cpu_blocks == available_cpu_blocks + + assert num_gpu_blocks == split_num_cache_blocks_evenly( + target_cache_block_size_bytes, draft_kv_size_bytes, + available_gpu_blocks) + + +@pytest.mark.parametrize('available_gpu_blocks', + list(range(20)) + [1024, 1024**2]) +@pytest.mark.parametrize('target_cache_block_size_bytes', + [2 * 2 * 4096, 2 * 2 * 8192]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify split_num_cache_blocks_evenly does not exceed original memory + allocation in bytes. 
+ """ + num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, + draft_kv_size_bytes, + available_gpu_blocks) + assert (num_blocks * target_cache_block_size_bytes) + ( + num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * + target_cache_block_size_bytes) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py new file mode 100644 index 0000000000000..19833ddb06154 --- /dev/null +++ b/tests/spec_decode/test_utils.py @@ -0,0 +1,111 @@ +from vllm.spec_decode.util import get_all_seq_ids +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import split_batch_by_proposal_len + +import pytest +from unittest.mock import MagicMock + + +def test_get_all_seq_ids(): + """Verify get_all_seq_ids extracts all seq ids. + """ + expected_seq_ids = list(range(10)) + list(range(100, 110)) + + seq_group_metadata_list = [ + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + seq_data={ + seq_id: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + seq_id: MagicMock(), + }, + lora_request=None, + ) for seq_id in expected_seq_ids + ] + + actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) + assert actual_seq_ids == expected_seq_ids + + +@pytest.fixture +def fake_sequence_group_metadata(): + seq_ids = list(range(3)) + return [ + SequenceGroupMetadata( + request_id=str(i), + is_prompt=True, + seq_data={ + i: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + i: MagicMock(), + }, + lora_request=None, + ) for i in seq_ids + ] + + +def test_filter_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + expected_groups = [ + fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] + ] + expected_indices = [0, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 2] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + expected_groups = [ + fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] + ] + expected_indices = [1, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_empty_inputs(): + filtered_groups, indices = split_batch_by_proposal_len( + [], [], select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): + proposal_lens = [0, 0, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): + proposal_lens = [1, 1, 1] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] diff --git a/tests/worker/spec_decode/utils.py b/tests/spec_decode/utils.py similarity index 60% rename from tests/worker/spec_decode/utils.py rename to tests/spec_decode/utils.py index fa8767cf898aa..997093988c0eb 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,13 +1,16 @@ 
import torch -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Iterable, Union +from unittest.mock import MagicMock from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData +from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, + SamplerOutput, SequenceGroupOutput, SequenceOutput) from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed +from itertools import count from dataclasses import dataclass, fields @@ -24,6 +27,11 @@ def to_dict(self): return dict( (field.name, getattr(self, field.name)) for field in fields(self)) + @classmethod + def from_dict(cls, d): + cleaned = dict((field.name, d[field.name]) for field in fields(cls)) + return cls(**cleaned) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -50,6 +58,21 @@ def create_execute_model_data( ) +def mock_worker(cls=None, + vocab_size: int = 30_000, + max_model_len: int = 2048, + rank: int = 0) -> MagicMock: + if cls is None: + cls = Worker + + worker = MagicMock(spec=cls) + worker.vocab_size = vocab_size + worker.max_model_len = max_model_len + worker.rank = rank + worker.device = 'cuda:0' + return worker + + def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -117,25 +140,12 @@ def create_seq_group_metadata_from_prompts( block_size: int, final_seq_lens: List[int], continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, seq_ids: Optional[List[int]] = None, ) -> List[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. 
- num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - if seq_ids is None: seq_ids = list(i for i, _ in enumerate(prompts)) @@ -155,13 +165,15 @@ def create_seq_group_metadata_from_prompts( is_prompt=len(cont_token_ids) == 0, seq_data={ i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) + SequenceData( + prompt_token_ids=prompt_token_ids[:], + output_token_ids=cont_token_ids[:], + ), }, sampling_params=SamplingParams(temperature=0.0, ), block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) + ) for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)) ] @@ -178,3 +190,68 @@ def assert_logprobs_dict_allclose( expected = torch.tensor( single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) + + +def create_sampler_output_list( + token_ids: torch.Tensor, + probs: Iterable[Optional[torch.Tensor]], + seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + num_steps, batch_size = token_ids.shape + token_ids_by_step = token_ids.tolist() + + if seq_ids is None: + seq_ids = list(range(batch_size)) + + return [ + SamplerOutput(outputs=[ + SequenceGroupOutput( + samples=[ + SequenceOutput( + output_token=token_id, + parent_seq_id=seq_ids[seq_index], + logprobs={token_id: 0}, + ) + ], + prompt_logprobs=None, + ) for seq_index, token_id in enumerate(token_ids_by_step[step]) + ], + sampled_token_probs=probs[step], + sampled_token_ids=token_ids[step]) + for step in range(num_steps) + ] + + +def create_batch(batch_size, + k, + prompt_len: Union[int, List[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[List[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_seq_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + execute_model_data = create_execute_model_data( + create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks, + block_size, final_seq_lens, + prev_output_tokens, seq_ids), ) + return execute_model_data, prompts, prev_output_tokens diff --git a/tests/test_sequence.py b/tests/test_sequence.py new file mode 100644 index 0000000000000..e18df059d770f --- /dev/null +++ b/tests/test_sequence.py @@ -0,0 +1,50 @@ +import pytest + +from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput + + +@pytest.fixture +def sample_outputs(): + return [ + SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) + ], + prompt_logprobs=None) for i in range(5) + ] + + +@pytest.fixture +def sampler_output(sample_outputs): + return SamplerOutput(outputs=sample_outputs) + + +def test_sampler_output_initialization(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + assert sampler_output.spec_decode_worker_metrics is 
None + + +def test_sampler_output_getitem(sampler_output, sample_outputs): + assert sampler_output[2] == sample_outputs[2] + + +def test_sampler_output_setitem(sampler_output): + new_output = SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) + ], + prompt_logprobs=None) + sampler_output[2] = new_output + assert sampler_output[2] == new_output + + +def test_sampler_output_len(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + + +def test_sampler_output_eq(sample_outputs): + sampler_output1 = SamplerOutput(outputs=sample_outputs) + sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) + sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) + assert sampler_output1 == sampler_output2 + assert sampler_output1 != sampler_output3 diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3e1cfc783b8ef..5643454060251 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -21,8 +21,6 @@ def __init__(self, strict_mode: bool = False): nontrivial latency. """ super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 self._strict_mode = strict_mode # NOTE: A "bonus token" is accepted iff all proposal tokens are @@ -44,6 +42,14 @@ def init_gpu_tensors(self, rank: int) -> None: dtype=torch.long, device=device) + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + def forward( self, target_probs: torch.Tensor, diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 320cb443524ca..19e7f630c4620 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -587,4 +587,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output + return SamplerOutput(outputs=sampler_output) diff --git a/vllm/sequence.py b/vllm/sequence.py index fee96a875dde5..37c102407a5f2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -2,12 +2,16 @@ import copy import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, TYPE_CHECKING from vllm.block import LogicalTokenBlock from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +if TYPE_CHECKING: + import torch + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Logprob: @@ -81,6 +85,8 @@ class SequenceData: Args: prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. Attributes: prompt_token_ids: The token IDs of the prompt. 
@@ -91,9 +97,13 @@ class SequenceData: def __init__( self, prompt_token_ids: List[int], + output_token_ids: Optional[List[int]] = None, ) -> None: + if output_token_ids is None: + output_token_ids = [] + self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] + self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 def append_token_id(self, token_id: int, logprob: float) -> None: @@ -117,6 +127,12 @@ def get_last_token_id(self) -> int: return self.prompt_token_ids[-1] return self.output_token_ids[-1] + def get_prompt_token_ids(self) -> int: + return self.prompt_token_ids + + def get_output_token_ids(self) -> int: + return self.output_token_ids + def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self.prompt_token_ids}, " @@ -506,6 +522,35 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. -SamplerOutput = List[SequenceGroupOutput] +@dataclass +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This datastructure implements methods so it can be used like a list, but + also has optional fields for device tensors. + """ + + outputs: List[SequenceGroupOutput] + + # On-device tensor containing probabilities of each token. + sampled_token_probs: Optional["torch.Tensor"] = None + + # On-device tensor containing the sampled token ids. + sampled_token_ids: Optional["torch.Tensor"] = None + + # Spec decode metrics populated by workers. + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py new file mode 100644 index 0000000000000..478c950f52873 --- /dev/null +++ b/vllm/spec_decode/batch_expansion.py @@ -0,0 +1,351 @@ +from typing import Iterator, List, Tuple, Optional, Dict +from itertools import chain, count + +import torch + +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) +from vllm.worker.worker import Worker +from vllm.spec_decode.util import nvtx_range, sampler_output_to_torch, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.interfaces import SpeculativeScorer, SpeculativeProposals, SpeculativeScores + +SeqId = int +TargetSeqId = int +TokenId = int + + +class BatchExpansionTop1Scorer(SpeculativeScorer): + """Implements a speculative scorer that uses batch expansion to get + probabilities of speculative tokens according to the scoring model. + + Batch expansion converts a list of sequences and multiple query positions + to a new batch of sequences, each with a single query position. This allows + for MQA-like scoring in speculative decoding without requiring an MQA + kernel. + + It is strictly less efficient than MQA scoring. + + It only supports scoring the top1 proposal tokens of the proposer, instead + of topk/tree. 
+ """ + + def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + self._scorer_worker = scorer_worker + self._device = device + self._vocab_size = vocab_size + + @nvtx_range("BatchExpansionTop1Scorer.score_proposals") + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> SpeculativeScores: + """Score the proposed tokens via the scorer model. + + This converts each input sequence to a set of k+1 target sequences. The + target sequences have the unique continuations to be scored and a + unique sequence ID that is different from all input sequence ids. + + If a speculative sequence length would exceed the max model length, then + no speculation is produced for that sequence. + + Args: + seq_group_metadata_list: The input sequence group metadata. + blocks_to_swap_in: This is passed to the worker during scoring. + blocks_to_swap_out: This is passed to the worker during scoring. + blocks_to_copy: This is passed to the worker during scoring. + k: The fixed proposal length. + proposals: The speculative proposals to score. + Returns: + SpeculativeScores: The scores of each speculative token, along with + which sequences were ignored during scoring. + """ + + # TODO(cade) perform this on GPU to remove blocking call. + proposal_lens_list = proposals.proposal_lens.tolist() + proposal_token_ids_list = proposals.proposal_token_ids.tolist() + + spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) + + target_sampler_output = self._scorer_worker.execute_model( + seq_group_metadata_list=target_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + all_tokens, all_probs = self._contract_batch( + original_bs=len(seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=k, + ) + + return SpeculativeScores( + probs=all_probs, + token_ids=all_tokens, + ) + + def _expand_batch( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[TokenId], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + """Given the input sequences and potentially multiple corresponding + proposal tokens, create a new batch where each sequence has a single + query token. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
+ spec_seqs, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + + target_seq_group_metadata_list = self._create_scoring_model_input( + spec_seqs, proposal_token_ids_list) + num_scoring_tokens = len(target_seq_group_metadata_list) + target_seq_group_metadata_list.extend(non_spec_seqs) + + return spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens + + def _contract_batch(self, original_bs: int, + target_sampler_output: List[SamplerOutput], + proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + """ + (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + batch_size, k = proposals.proposal_token_ids.shape + + target_token_ids = target_token_ids.squeeze().reshape( + batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(batch_size, k + 1, + self._vocab_size) + + all_tokens = torch.full(size=(original_bs, k + 1), + fill_value=-1, + device=self._device, + dtype=torch.long) + all_probs = torch.zeros(original_bs, + k + 1, + self._vocab_size, + device=self._device, + dtype=torch.float32) + + if non_spec_indices: + all_tokens[non_spec_indices, 0] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + + if spec_indices: + all_tokens[spec_indices] = target_token_ids + all_probs[spec_indices] = target_probs + + return all_tokens, all_probs + + def _create_scoring_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + ) -> List[SequenceGroupMetadata]: + """Given the original input sequences and proposed tokens from the draft + model, create a list of target sequences that can be used for scoring. + """ + + if not seq_group_metadata_list: + return [] + + target_seq_ids_iter = self._create_target_seq_id_iterator( + get_all_seq_ids(seq_group_metadata_list)) + + target_seq_group_metadata = list( + chain.from_iterable( + self._create_target_seq_group_metadata( + seq_group_metadata, + proposal_token_ids, + i, + target_seq_ids_iter, + ) for i, seq_group_metadata in enumerate( + seq_group_metadata_list))) + + return target_seq_group_metadata + + def _create_target_seq_group_metadata( + self, + input_seq_group_metadata: SequenceGroupMetadata, + proposal_token_ids: List[TokenId], # shape: [batch_size, k] + batch_index: int, + target_seq_ids_iter: Iterator[TargetSeqId], + ) -> List[SequenceGroupMetadata]: + """Given an input sequence group metadata and a list of draft tokens, + create a list of target SequenceGroupMetadata, one for each + token id that needs to be scored. + + Naive speculative decoding requires K target model scores, one for each + draft model token. However one can add a bonus token such that if each + token is accepted, then a final token may be sampled from the model. 
+ This function creates K+1 target SequenceGroupMetadata to take + advantage of the bonus token. + """ + assert not input_seq_group_metadata.is_prompt, ( + "Speculating on " + "prompts not yet supported") + assert len(input_seq_group_metadata.seq_data) == 1, ( + "Beam search " + "not supported in speculative decoding") + input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) + + token_ids_to_score = self._get_token_ids_to_score( + proposal_token_ids[batch_index]) + + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for token_ids in token_ids_to_score: + target_seq_group_metadata_list.append( + self._create_single_target_seq_group_metadata( + input_seq_group_metadata, + input_seq_id, + next(target_seq_ids_iter), + token_ids, + )) + + return target_seq_group_metadata_list + + def _create_single_target_seq_group_metadata( + self, + seq_group_metadata: SequenceGroupMetadata, + seq_id: SeqId, + target_seq_id: TargetSeqId, + token_ids: List[TokenId], + ) -> SequenceGroupMetadata: + """Create a single target SequenceGroupMetadata. + + Args: + seq_group_metadata: The metadata for the input sequence. + seq_id: The input sequence ID. + target_seq_id: The corresponding target sequence ID. + token_ids: The list of token ids that are to be appended to the + input sequence. + """ + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_token_ids = seq_data.get_prompt_token_ids() + new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + + return SequenceGroupMetadata( + request_id=seq_group_metadata.request_id, + is_prompt=seq_group_metadata.is_prompt, + seq_data={ + target_seq_id: + SequenceData( + prompt_token_ids=prompt_token_ids, + output_token_ids=new_output_token_ids, + ), + }, + sampling_params=seq_group_metadata.sampling_params, + block_tables={ + target_seq_id: seq_group_metadata.block_tables[seq_id], + }, + lora_request=None, + ) + + def _split_scoring_output( + self, sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = [ + num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - num_scoring_tokens + ] + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + + # Convert scores to tensors. + sampler_output.sampled_token_probs = spec_probs + sampler_output.sampled_token_ids = spec_sampled_tokens + target_token_ids, target_probs = sampler_output_to_torch( + [sampler_output]) + + # Convert non-speculative output tokens to tensors. 
+ sampler_output.sampled_token_probs = non_spec_probs + sampler_output.sampled_token_ids = non_spec_sampled_tokens + non_spec_target_token_ids, non_spec_target_probs = sampler_output_to_torch( + [sampler_output]) + + return target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs + + def _create_target_seq_id_iterator( + self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + """Create an iterator for creating target sequence ids. + Target sequence ids are distinct from sequence ids because we create a + distinct target sequence id for each proposal token to be scored. + + This implementation increments a counter starting at 1 + max of all + provided input sequence ids. + """ + return count(start=max(seq_ids) + 1) + + def _get_token_ids_to_score( + self, + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: + """Given an int tensor of proposal token ids, return a list of + token ids that should be scored. + + Returns k+1 output lists. The additional one is used for generating the + bonus token. + + Example: + Input: [0, 1, 2, 3] (k=4) + Output: (k+1 lists) + [] + [0] + [0, 1] + [0, 1, 2] + [0, 1, 2, 3] + """ + empty_token_ids = [] + + token_ids_to_score = [empty_token_ids] + token_ids_to_score.extend([ + full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids)) + ]) + return token_ids_to_score diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py new file mode 100644 index 0000000000000..9e53ffb60ac32 --- /dev/null +++ b/vllm/spec_decode/interfaces.py @@ -0,0 +1,77 @@ +from typing import List, Tuple, Optional, Dict +from dataclasses import dataclass +from abc import ABC, abstractmethod + +import torch + +from vllm.sequence import SequenceGroupMetadata + + +@dataclass +class SpeculativeProposals: + """Datastructure used to represent proposal tokens from some proposer. It + also tracks how many speculative tokens each sequence has. + """ + + # Speculative proposal tokens. + proposal_token_ids: torch.Tensor + + # Probabilities of the proposal tokens according to the proposer. + proposal_probs: torch.Tensor + + # The valid length of each proposal; can be zero. + proposal_lens: torch.Tensor + + def __repr__(self): + return (f"SpeculativeProposals(" + f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_probs={self.proposal_probs.shape}, " + f"proposal_lens={self.proposal_lens.shape})") + + +@dataclass +class SpeculativeScores: + """Datastructure used to represent the scores of speculative tokens + according to the scoring model. + """ + + # Probabilities of the speculative tokens according to the scoring model. + probs: torch.Tensor + + # Token ids sampled from the scoring model. Used for speculative bonus + # tokens and also non-speculative normal decoding. 
+ token_ids: torch.Tensor + + def __repr__(self): + return (f"SpeculativeScores(" + f"probs={self.probs.shape}, " + f"token_ids={self.token_ids.shape})") + + +class SpeculativeProposer(ABC): + + @abstractmethod + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + raise NotImplementedError + + +class SpeculativeScorer(ABC): + + @abstractmethod + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py new file mode 100644 index 0000000000000..65a2a4a63a98f --- /dev/null +++ b/vllm/spec_decode/metrics.py @@ -0,0 +1,174 @@ +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from typing import Optional +from vllm.utils import in_wsl +import time +from typing import Callable + + +@dataclass +class SpecDecodeWorkerMetrics: + """Dataclass holding metrics emitted from the spec decode worker. + """ + + # The empirical acceptance rate of the proposal method on a per-token basis. + # This is useful for evaluating how well the proposal method aligns with the + # scoring method. + draft_acceptance_rate: float + + # The empirical efficiency, measured as the number of tokens emitted by the + # system divided by the number of tokens that could be emitted by the system + # if the proposal method were perfect. + system_efficiency: float + + # The number of speculative tokens produced by the proposal method. + draft_tokens: int + + # The number of tokens emitted by the entire system. + emitted_tokens: int + + # The number of tokens accepted by the scoring model and verification + # routine, e.g. Llama2-70B and lossless rejection sampling. + # + # NOTE: Any token accepted by the verification routine is considered + # accepted (regardless of if the speculative prefix is also accepted). The + # user will usually see less accepted tokens. This metric is helpful when + # evaluating alignment of the proposal method with the scoring model. + accepted_tokens: int + + # The number of speculative tokens per sequence. + num_spec_tokens: int + + +Timer = Callable[[], float] + + +class AsyncMetricsCollector: + """Class which copies rejection sampler metrics from the device to CPU on a + non-default Torch stream. + """ + + def __init__(self, + rejection_sampler: RejectionSampler, + timer: Optional[Timer] = None, + collect_interval_s: float = 5.0): + self._rejection_sampler = rejection_sampler + self._timer = time.time if timer is None else timer + + self._rank: Optional[int] = None + + # We don't have a device set yet. 
+ self._copy_stream: Optional[torch.cuda.Stream] = None + + self._in_flight_copy: Optional[torch.cuda.Event] = None + + pin_memory = not in_wsl() + self._aggregate_num_accepted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_emitted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_draft_tokens = 0 + + self._rejsample_metrics_collect_interval_s = collect_interval_s + self._last_metrics_collect_time = self._timer() + + def init_gpu_tensors(self, rank: int) -> None: + self._rank = rank + self._copy_stream = torch.cuda.Stream() + + def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # If a copy was initiated in the previous call, collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Otherwise, check if we should start a new copy. + if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + def _should_collect_rejsample_metrics(self, now: float) -> bool: + """Return whether or not this iteration should print rejection sampling + metrics. + """ + if self._rank != 0: + return False + + if (now - self._last_metrics_collect_time < + self._rejsample_metrics_collect_interval_s): + return False + return True + + def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: + """Copy rejection sampling metrics (number of accepted tokens, etc) to + CPU asynchronously. + + Returns a CUDA event recording when the copy is complete. + """ + self._copy_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._copy_stream): + self._aggregate_num_accepted_tokens.copy_( + self._rejection_sampler.num_accepted_tokens, non_blocking=True) + self._aggregate_num_emitted_tokens.copy_( + self._rejection_sampler.num_emitted_tokens, non_blocking=True) + # Number of draft tokens is calculated on CPU, so no copy is + # required. + self._aggregate_num_draft_tokens = ( + self._rejection_sampler.num_draft_tokens) + + aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready.record(self._copy_stream) + + return aggregate_metrics_ready + + def _collect_rejsample_metrics( + self, k: int, + ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: + """Create metrics object from statistics copied asynchronously. + + Args: + k: int. The number of speculative tokens; used to determine system + efficiency. + ready_event: torch.cuda.Event. The CUDA event recording when the + async GPU->CPU copy is complete. 
+ """ + + ready_event.synchronize() + accepted_tokens = self._aggregate_num_accepted_tokens.item() + emitted_tokens = self._aggregate_num_emitted_tokens.item() + draft_tokens = self._aggregate_num_draft_tokens + + num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + + if draft_tokens > 0: + draft_acceptance_rate = accepted_tokens / draft_tokens + else: + draft_acceptance_rate = float("nan") + + if num_possible_tokens > 0: + system_efficiency = emitted_tokens / num_possible_tokens + else: + system_efficiency = float("nan") + + return SpecDecodeWorkerMetrics( + num_spec_tokens=k, + draft_acceptance_rate=draft_acceptance_rate, + system_efficiency=system_efficiency, + accepted_tokens=accepted_tokens, + draft_tokens=draft_tokens, + emitted_tokens=emitted_tokens, + ) + + @staticmethod + def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + # Divide by k since batch size can be variable. + total_num_spec_seqs = draft_tokens / k + num_accepted_per_seq_if_all_accepted = k + 1 + return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py new file mode 100644 index 0000000000000..f7be14d3d22c2 --- /dev/null +++ b/vllm/spec_decode/multi_step_worker.py @@ -0,0 +1,366 @@ +from typing import List, Dict, Optional, Tuple +import copy + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.worker import Worker +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.util import sampler_output_to_torch + + +class MultiStepWorker(Worker): + """The MultiStepWorker is equivalent to a Worker except that it allows + multiple forward passes in a single call, assuming the scheduler has + allocated enough space to store the additional KV. This reduces overhead + by invoking the scheduler less. + + The MultiStepWorker does not support cache swap operations, or beam search. + Cache swap operations do not require large modifications. On the other hand, + beam search requires memory allocations during sequence forks and thus + requires more thought for MultiStepWorker support. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._proposer: Optional[DraftModelTop1Proposer] = None + + def init_model(self): + super().init_model() + + self._proposer = DraftModelTop1Proposer( + self, + self.device, + self.max_model_len, + self.vocab_size, + ) + + @torch.inference_mode() + def execute_model_multi_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_steps: int, + ) -> List[SamplerOutput]: + """Run the model forward pass num_steps times. Returns the list of + sampler output, one per model forward pass. + """ + self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, + blocks_to_swap_out, blocks_to_copy) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + seq_group_metadata_list) + + # Assert enough KV space for num_steps tokens per sequence. + self._assert_enough_kv_space(seq_group_metadata_list, num_steps) + + # Run model num_steps times. 
+ model_outputs = [] + for _ in range(num_steps): + model_output = super().execute_model( + seq_group_metadata_list=copied_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs + + def get_spec_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + + return self._proposer.get_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + max_proposal_len, + ) + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. 
+ seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _raise_if_unsupported( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + +class DraftModelTop1Proposer(SpeculativeProposer): + """Helper class which separates out sequences which would exceed the max + model length when speculated upon. + + This allows combinations of models such as JackFram/llama-68m draft with + meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of + 2048 while Llama2-13b has max_position_embeddings of 4096. + + We treat the sequences which exceed the proposal draft model length as + "non-spec sequences". Essentially they skip the draft model and go through + normal decoding in the target model. + + Currently, only proposal_lens of 0 and k are supported, where k is a global + batch proposal length. In the future vLLM should support per-sequence + proposal lengths. + """ + + def __init__( + self, + draft_worker: MultiStepWorker, + device: str, + max_model_len: int, + vocab_size: int, + ): + self._draft_worker = draft_worker + self._device = device + self._max_model_len = max_model_len + self._vocab_size = vocab_size + + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Get speculative proposals given the input batch. + + Sequences which would exceed the max model length are skipped during + speculation. + """ + + # Split speculative- and non-speculative- sequences. + proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) + + if nonzero_proposal_len_seqs: + # Speculate tokens using the draft worker for the speculative + # sequences. 
+ maybe_sampler_output = self._draft_worker.execute_model_multi_step( + seq_group_metadata_list=nonzero_proposal_len_seqs, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + num_steps=max_proposal_len, + ) + else: + # If no sequences can be speculated, set sampler output to None. + maybe_sampler_output = None + + # Combine speculative- and non-speculative sequences into the same + # representation. + proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( + batch_size=len(seq_group_metadata_list), + max_proposal_len=max_proposal_len, + maybe_sampler_output=maybe_sampler_output, + proposal_lens=proposal_lens, + nonzero_proposal_len_indices=nonzero_proposal_len_indices, + ) + + proposals = SpeculativeProposals( + proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + ) + + return proposals + + def _split_by_max_model_len( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + max_proposal_len: int, + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + """Determine which sequences would exceed the max model length. + """ + + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_data = next(iter(seq_group_metadata.seq_data.values())) + seq_len = seq_data.get_len() + + # Currently only proposal lens of 0 or the global batch proposal len + # are supported. + if seq_len + max_proposal_len < self._max_model_len: + proposal_lens.append(max_proposal_len) + nonzero_proposal_len_seqs.append(seq_group_metadata) + nonzero_proposal_len_indices.append(i) + else: + proposal_lens.append(0) + + return proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices + + def _merge_outputs( + self, + batch_size: int, + max_proposal_len: int, + maybe_sampler_output: Optional[SamplerOutput], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty tensors. + proposal_tokens = torch.zeros(0, + max_proposal_len, + dtype=torch.long, + device=self._device) + proposal_probs = torch.zeros(0, + max_proposal_len, + self._vocab_size, + dtype=torch.float32, + device=self._device) + proposal_lens = torch.zeros(len(proposal_lens), + dtype=torch.long, + device=self._device) + return proposal_tokens, proposal_probs, proposal_lens + + sampler_output = maybe_sampler_output + + proposal_tokens, proposal_probs = sampler_output_to_torch( + sampler_output) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. 
[-1, -1, -1] + + entire_proposal_tokens = torch.full(size=(batch_size, + *proposal_tokens.shape[1:]), + fill_value=-1, + dtype=torch.long, + device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = torch.zeros(batch_size, + *proposal_probs.shape[1:], + dtype=torch.float32, + device=self._device) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = entire_proposal_tokens, entire_proposal_probs + + proposal_lens = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens[nonzero_proposal_len_indices] = max_proposal_len + + return proposal_tokens, proposal_probs, proposal_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py new file mode 100644 index 0000000000000..890e479202372 --- /dev/null +++ b/vllm/spec_decode/spec_decode_worker.py @@ -0,0 +1,372 @@ +from typing import List, Tuple, Optional, Dict +from functools import cached_property + +import torch + +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) +from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.config import CacheConfig +from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.spec_decode.interfaces import SpeculativeScorer + + +class SpecDecodeWorker: + """Worker which implements speculative decoding. + + Speculative decoding reduces decoding per-token latency by using a proposal + method, such as a small draft model, to speculate ahead of a larger LLM. The + probabilities of the speculative tokens are then determined by the larger + LLM, after which some verification routine determines which (if any) of the + speculative tokens are accepted by the larger LLM. + + See https://github.com/vllm-project/vllm/pull/2188 and + https://github.com/vllm-project/vllm/pull/3103 for more info. + + The current implementation has the following limitations: + * Only draft-model proposal is implemented (contributions for more forms are + welcome!). + * Only top-1 proposal and scoring are implemented. Tree-attention is left as + future work. + * Only lossless rejection sampling is supported. Contributions adding lossy + verification routines are welcome (e.g. Medusa's typical acceptance). + * All sequences in a batch must have the same proposal length, or zero. This + can be improved by having per-sequence speculation in the future. + * The scoring forward pass is done without an MQA kernel, which is + suboptimal especially as the batch size, proposal length, and sequence + lengths grow. Contributions to add a MQA scoring are welcome once + correctness tests pass. + More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. + """ + + def __init__( + self, + proposer_worker: MultiStepWorker, + scorer_worker: Worker, + rejection_sampler: RejectionSampler, + metrics_collector: Optional[AsyncMetricsCollector] = None, + ): + """ + Create a SpecDecodeWorker. + + Args: + proposer_worker: A worker that can produce speculative tokens for + sequences. 
+ scorer_worker: A worker that produces probabilities of speculative + tokens according to some base model. Typically a vanilla vLLM + Worker. + rejection_sampler: A Torch module used to perform modified rejection + sampling for speculative decoding. + metrics_collector: Helper class for collecting metrics; can be set + for testing purposes. + """ + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + self.rejection_sampler = rejection_sampler + + self._metrics = AsyncMetricsCollector( + rejection_sampler + ) if metrics_collector is None else metrics_collector + + self.probs_dtype = self.rejection_sampler.probs_dtype + self.token_id_dtype = self.rejection_sampler.token_id_dtype + + self.scorer: SpeculativeScorer = None + + def init_model(self) -> None: + """Initialize both scorer and proposer models. + """ + # The scorer worker model is initialized first in case the proposer + # model has a smaller TP degree than the target worker. + self.scorer_worker.init_model() + self.proposer_worker.init_model() + + self._metrics.init_gpu_tensors(self.rank) + self.rejection_sampler.init_gpu_tensors(self.rank) + self.scorer = BatchExpansionTop1Scorer( + scorer_worker=self.scorer_worker, + device=self.device, + vocab_size=self._vocab_size) + + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str) -> Tuple[int, int]: + """Determine the number of cache blocks to use. + + This is done by profiling the scorer model (which is typically the + larger of the two). Then the total memory which would be used by the + scorer cache is divided evenly between the proposer and scorer model KV, + such that the number of blocks is equal in both KV caches. + """ + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, + cache_dtype)) + + scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype) + proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype) + + new_num_gpu_blocks = split_num_cache_blocks_evenly( + scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, + num_gpu_blocks) + return new_num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig): + """Initialize the cache engine of the scorer and proposer workers. + """ + self.scorer_worker.init_cache_engine(cache_config) + self.proposer_worker.init_cache_engine(cache_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_spec_tokens: int, + ) -> List[SamplerOutput]: + """Perform speculative decoding on the input batch. + """ + + assert seq_group_metadata_list is not None, ( + "speculative decoding " + "requires non-None seq_group_metadata_list") + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. 
+ if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + return self._run_no_spec( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + return self._run_speculative_decoding_step( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + k=num_spec_tokens, + ) + + @nvtx_range("spec_decode_worker._run_no_spec") + def _run_no_spec( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + ) -> List[SamplerOutput]: + """Run a prefill step, without any speculation. The input is sent to the + proposer and scorer model so that the KV cache is consistent between the + two. + """ + + self.proposer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + sampler_output = self.scorer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.probs = None + sampler_output.sampled_tokens = None + return [sampler_output] + + @nvtx_range("spec_decode_worker._run_speculative_decoding_step") + def _run_speculative_decoding_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + ) -> List[SamplerOutput]: + """Execute a single step of speculative decoding. + + This invokes the proposer worker to get k speculative tokens for each + sequence, then scores each speculative token using the scoring worker. + + Returns a list of SamplerOutput, each containing a single token per + sequence. + """ + + # Generate proposals using draft worker. + proposals = self.proposer_worker.get_spec_proposals( + seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, k) + + proposal_scores = self.scorer.score_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + k, + proposals, + ) + + accepted_token_ids = self._verify_tokens(seq_group_metadata_list, + proposal_scores, proposals, k) + + return self._create_output_sampler_list(seq_group_metadata_list, + accepted_token_ids, k) + + @nvtx_range("spec_decode_worker._verify_tokens") + def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, + ) -> torch.Tensor: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. 
It can be + # done by supporting per-sequence proposal lens. + _, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + _, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + original_indices = spec_indices + non_spec_indices + + proposal_probs = proposal_scores.probs[spec_indices, :-1] + bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + accepted_token_ids = self.rejection_sampler( + proposal_probs, + bonus_token_ids, + proposals.proposal_probs, + proposals.proposal_token_ids, + ) + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + + 1).clone() + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat( + [accepted_token_ids, non_spec_token_ids]) + + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + return accepted_token_ids + + def _create_output_sampler_list( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] + k: int, + ) -> List[SamplerOutput]: + """Given the accepted token ids, create a list of SamplerOutput. + + The output is padded with -1 tokens such that each sequence has + the same number of outputs. + """ + seq_ids = get_all_seq_ids(seq_group_metadata_list) + + # shape: [k+1, batch_size] + accepted_token_ids_by_step = accepted_token_ids.transpose(0, + 1).tolist() + sampler_output_list = [] + for token_ids_by_step in accepted_token_ids_by_step: + if all(token_id == -1 for token_id in token_ids_by_step): + break + + step_output_token_ids = [] + for token_id, seq_id in zip(token_ids_by_step, seq_ids): + step_output_token_ids.append( + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq_id, + output_token=token_id, + # TODO Add verifier logprobs. + logprobs={token_id: 0.0}, + ) + ], + prompt_logprobs=None, + )) + sampler_output_list.append( + SamplerOutput(outputs=step_output_token_ids)) + + maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( + k) + if maybe_rejsample_metrics is not None: + sampler_output_list[ + 0].spec_decode_worker_metrics = maybe_rejsample_metrics + + return sampler_output_list + + @cached_property + def _vocab_size(self) -> int: + """Get the vocab size of the model and make sure it's consistent between + draft and target workers. + """ + vocab_sizes = [ + worker.vocab_size + for worker in [self.proposer_worker, self.scorer_worker] + ] + assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) + return vocab_sizes[0] + + @property + def rank(self): + return self.scorer_worker.rank + + @property + def device(self): + return self.scorer_worker.device + + +def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, + proposer_cache_block_size_bytes: int, + total_num_gpu_blocks: int) -> int: + """Given total_num_gpu_blocks, the number of GPU blocks that could be + allocate to the target model, this function calculates how many blocks + should be given to the draft and target model. + + Note that usually the block size, in bytes, of each model is different, + as it's a function of number of KV/layer, number of heads, and hidden + dimension size. 
+ + Since the target and draft models allocate the same number of blocks, we + simply calculate the number of blocks where if allocated by both models, + the total memory usage from KV cache is no larger than the number of + blocks allocatable by the target model alone. + """ + new_num_gpu_blocks = int( + total_num_gpu_blocks * scorer_cache_block_size_bytes / + (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) + + return new_num_gpu_blocks diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py new file mode 100644 index 0000000000000..2c5f954551905 --- /dev/null +++ b/vllm/spec_decode/util.py @@ -0,0 +1,99 @@ +import torch +from typing import List, Tuple +from vllm.sequence import SequenceGroupMetadata, SamplerOutput +from contextlib import contextmanager +from itertools import chain + +SeqId = int + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return list( + chain.from_iterable([ + seq_group_metadata.seq_data.keys() + for seq_group_metadata in seq_group_metadata_list + ])) + + +def split_batch_by_proposal_len( + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], select_proposal_len_zero: bool +) -> Tuple[List[SequenceGroupMetadata], List[int]]: + """Utility function that splits a batch based on whether the proposal len is + zero or not. We should remove this once vLLM supports per-sequence proposal + lens in a batch. + """ + + if select_proposal_len_zero: + predicate = lambda proposal_len: proposal_len == 0 + else: + predicate = lambda proposal_len: proposal_len != 0 + + indices = [ + i for i, (_, proposal_len + ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) + if predicate(proposal_len) + ] + seq_groups = [ + seq_group for seq_group, proposal_len in zip( + seq_group_metadata_list, proposal_lens) if predicate(proposal_len) + ] + + return seq_groups, indices + + +def sampler_output_to_torch( + sampler_output_list: List[SamplerOutput], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility function which converts a list of SamplerOutput to tensors. + + Returns: + sampled_token_ids: torch.Tensor + shape: [batch_size, len(sampler_output_list)] + + sampled_token_probs: torch.Tensor + shape: [batch_size, len(sampler_output_list), vocab_size] + """ + + # shape: [batch_size, num_sampler_output, vocab_size] + sampled_token_probs = torch.stack( + [ + sampler_output.sampled_token_probs + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + # shape: [batch_size, num_sampler_output] + sampled_token_ids = torch.stack( + [ + sampler_output.sampled_token_ids.flatten() + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + return sampled_token_ids, sampled_token_probs + + +@contextmanager +def nvtx_range(msg, *args, **kwargs): + """ + Context manager / decorator that pushes an NVTX range at the beginning + of its scope, and pops it at the end. If extra arguments are given, + they are passed as arguments to msg.format(). + + If running with cuda graphs, you must enable nsys cuda graph profiling. 
+ + Arguments: + msg (string): message to associate with the range + """ + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9023b0c59b3fb..0dd2309079403 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -97,8 +97,6 @@ def load_model(self) -> None: f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" ) - vocab_size = self.model.config.vocab_size - if self.lora_config: assert hasattr( self.model, "supported_lora_modules" @@ -111,7 +109,7 @@ def load_model(self) -> None: self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, + self.scheduler_config.max_paddings, self.vocab_size, self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -607,8 +605,7 @@ def execute_model( @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. - vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs @@ -774,6 +771,10 @@ def __del__(self) -> None: self.graph_runners.clear() self.cupy_nccl_backend = None + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CUDAGraphRunner: diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py deleted file mode 100644 index ab3e28389a04c..0000000000000 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import List, Dict -import copy - -import torch - -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.worker import Worker - - -class MultiStepWorker(Worker): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - @torch.inference_mode() - def execute_model_multi_step( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_steps: int, - ) -> List[SamplerOutput]: - """Run the model forward pass num_steps times. Returns the list of - sampler output, one per model forward pass. - """ - self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, - blocks_to_swap_out, blocks_to_copy) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - seq_group_metadata_list) - - # Assert enough KV space for num_steps tokens per sequence. 
- self._assert_enough_kv_space(seq_group_metadata_list, num_steps) - - # Run model num_steps times. - model_outputs = [] - for _ in range(num_steps): - model_output = super().execute_model( - seq_group_metadata_list=copied_seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob.logprob) - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - The multi-step worker must be able to append tokens to sequences after - a forward pass. This necessitates modification of the data structures - used by the worker. Since these data structures are shared with other - parts of vLLM, like the scheduler, we must take care not to introduce - unexpected side-effects. - - When Ray is used to orchestrate worker processes (such as when the - tensor-parallel degree is >1), this is not a problem because the input - datastructures will be serialized and created anew in the worker - process. - - However, when Ray is not used to orchestrate the worker processes (such - as when the tensor-parallel degree is 1), this is a problem. We avoid - the problem by shallow-copying the input datastructures (specifically, - the parts that will change in multiple steps). - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. 
- """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 157e8c45836b1..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -130,8 +130,8 @@ def profile_num_available_blocks( # GPU did not change their memory usage during the profiling. peak_memory = self.init_gpu_memory - free_gpu_memory - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -232,6 +232,22 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. 
+ """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + def init_distributed_environment( parallel_config: ParallelConfig, From 0bba88df03754c40bd9135fc2ff9554ffca59c87 Mon Sep 17 00:00:00 2001 From: Terry <149540247+tterrysun@users.noreply.github.com> Date: Sat, 9 Mar 2024 17:14:16 -0800 Subject: [PATCH 076/113] Enhance lora tests with more layer and rank variations (#3243) --- csrc/punica/bgmv/bgmv_config.h | 1 + requirements-dev.txt | 1 + tests/lora/test_layer_variation.py | 104 +++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 tests/lora/test_layer_variation.py diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 3eb84ceb4d534..4dc90de1ab42a 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128) \ f(in_T, out_T, W_T, narrow, 256) \ f(in_T, out_T, W_T, narrow, 512) \ + f(in_T, out_T, W_T, narrow, 768) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1728) \ diff --git a/requirements-dev.txt b/requirements-dev.txt index dfcbfa4253f1c..5502c97d014ac 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,6 +21,7 @@ einops # required for MPT openai requests ray +peft # Benchmarking aiohttp diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py new file mode 100644 index 0000000000000..1a1da517b2276 --- /dev/null +++ b/tests/lora/test_layer_variation.py @@ -0,0 +1,104 @@ +from typing import List, Optional +import peft +import pytest +from random import sample +import tempfile +from transformers import AutoModelForCausalLM + +import vllm +from vllm.lora.request import LoRARequest +from .conftest import cleanup + +MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" +PROMPTS = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", +] + + +def get_lora_model(model_id: str, target_modules: List[str], rank: int): + model = AutoModelForCausalLM.from_pretrained(model_id) + lora_config = peft.tuners.lora.LoraConfig(target_modules, rank) + lora_model = peft.PeftModel(model, lora_config) + return lora_model + + +def do_sample(llm, + lora_path: Optional[str] = None, + lora_id: Optional[int] = None, + logprobs: int = 0, + n_tokens: int = 256): + prompts = PROMPTS + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=n_tokens, + logprobs=logprobs, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts = [] + generated_logprobs = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + generated_logprobs.append([ + list(logprob.keys()) for out in output.outputs + for logprob in out.logprobs + ]) + return generated_logprobs if logprobs else generated_texts + + +SUPPORTED_MODULES = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" +] +TARGET_MODULES_LIST = [] +for length in range(2, 6): + TARGET_MODULES_LIST.extend( + [sample(SUPPORTED_MODULES, length) for _ in range(3)]) + + +# Test the correctness when layer and rank are varied +# step 1: init a base model and serve with LoRA to get the reference results +# step 2: merge the same LoRA to the base model, serve the merged model +# step 3: compare the results from step 1 and step 2 +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) +@pytest.mark.parametrize("rank", [8, 16, 32, 64]) +def test_layer_variation_correctness(tp_size, target_modules, rank): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32) + del llm + cleanup() + reference_id_sets = [set(prob[0]) for prob in merged_probs] + + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + merged_model = model.merge_and_unload() + merged_model.save_pretrained(tmpdir) + llm = vllm.LLM(tmpdir, + tokenizer=MODEL_PATH, + enable_lora=False, + max_num_seqs=16, + tensor_parallel_size=tp_size, + worker_use_ray=True) + probs = do_sample(llm, logprobs=5, n_tokens=32) + del llm + cleanup() + # verify the top-5 tokens are identical for each token + id_sets = [set(prob[0]) for prob in probs] + assert id_sets == reference_id_sets From e4a28e53165902ffc5daf20977c70885d0c05768 Mon Sep 17 00:00:00 2001 From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com> Date: Sun, 10 Mar 2024 17:27:45 -0500 Subject: [PATCH 077/113] [ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUDA (#3262) --- csrc/attention/attention_kernels.cu | 8 -------- csrc/cuda_compat.h | 10 ++++++++++ csrc/reduction_utils.cuh | 6 +++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..5e61668d5cc1a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,9 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_ROCM -#include -#endif #include #include @@ -31,11 +28,6 @@ #include -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index aa58dd73c148a..c711d8d1b24b9 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,5 +1,15 @@ #pragma once +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + #ifndef USE_ROCM #define VLLM_LDG(arg) __ldg(arg) #else diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index b95ccef162072..210bf0b023ab2 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -24,7 +24,7 @@ namespace vllm { template __inline__ __device__ T warpReduceSum(T val) { #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) val += VLLM_SHFL_XOR_SYNC(val, mask); return val; } @@ -32,7 +32,7 @@ __inline__ __device__ T warpReduceSum(T val) { /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { - static __shared__ T shared[32]; + static __shared__ T shared[WARP_SIZE]; int lane = threadIdx.x & 0x1f; int wid = threadIdx.x >> 5; @@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) { // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent // blockDim.x is not divided by 32 - val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); + val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f); val = warpReduceSum(val); return val; } From 9e8744a545f34ca636a5113ae98cec5851af56eb Mon Sep 17 00:00:00 2001 From: Roy Date: Mon, 11 Mar 2024 10:17:16 +0800 Subject: [PATCH 078/113] [BugFix] Fix get tokenizer when using ray (#3301) --- tests/async_engine/test_async_llm_engine.py | 3 +++ vllm/engine/async_llm_engine.py | 9 +++++++-- vllm/engine/llm_engine.py | 8 +++++++- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/transformers_utils/tokenizer.py | 6 ++++-- 6 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1e31ff7373031..cb125a7bfec30 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -89,3 +89,6 @@ async def test_new_requests_event(): await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 assert engine.engine.step_calls == old_step_calls + 1 + + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) + assert engine.get_tokenizer() is not None diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 65ab0c0634176..5629d1a863d04 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -5,6 +5,8 @@ from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union, AsyncIterator, Callable) +from transformers import PreTrainedTokenizer + from vllm.lora.request import LoRARequest from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs @@ -372,8 +374,11 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) - def get_tokenizer(self): - return self.engine.tokenizer.tokenizer + async def get_tokenizer(self) -> "PreTrainedTokenizer": + if self.engine_use_ray: + return await self.engine.get_tokenizer.remote() + else: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" diff 
--git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8484014c9a13f..5b46d9db5649a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -7,6 +7,8 @@ from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union) +from transformers import PreTrainedTokenizer + import vllm from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, @@ -163,7 +165,11 @@ def __reduce__(self): # the closure used to initialize Ray worker actors raise RuntimeError("LLMEngine should not be pickled!") - def get_tokenizer_for_seq(self, sequence: Sequence): + def get_tokenizer(self) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer() + + def get_tokenizer_for_seq(self, + sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) def _dispatch_worker(self): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ba352f18f6454..7d5603c85e4e9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -65,7 +65,7 @@ async def create_chat_completion( lora_request = self._maybe_get_lora(request) guided_decode_logits_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8244fd150753..c673b2582c47b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -126,7 +126,7 @@ async def create_completion(self, request: CompletionRequest, lora_request = self._maybe_get_lora(request) guided_decode_logit_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6edc225cdfc80..2600ea2642da2 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -120,7 +120,8 @@ async def encode_async( def get_lora_tokenizer( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: @@ -133,7 +134,8 @@ def get_lora_tokenizer( async def get_lora_tokenizer_async( self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": if not lora_request or not self.enable_lora: return self.tokenizer if lora_request.lora_int_id not in self.lora_tokenizers: From 4b59f00e917679337169c88c981f268e6ab96cd6 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 10 Mar 2024 19:17:46 -0700 Subject: [PATCH 079/113] [Fix] Fix best_of behavior when n=1 (#3298) --- vllm/outputs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 4f9eddee11cd4..b8173fd7a0638 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -87,12 +87,12 @@ def __init__( 
@classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if n == 1: + if len(seqs) == 1: top_n_seqs = seqs else: + # Get the top-n sequences. + n = seq_group.sampling_params.n if seq_group.sampling_params.use_beam_search: sorting_key = lambda seq: seq.get_beam_search_score( seq_group.sampling_params.length_penalty) From 2f8844ba08d77af8a64784317055b03a475f6051 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 10 Mar 2024 19:49:14 -0700 Subject: [PATCH 080/113] Re-enable the 80 char line width limit (#3305) --- pyproject.toml | 6 +- setup.py | 4 +- tests/async_engine/test_chat_template.py | 6 +- tests/core/test_block_manager.py | 3 +- tests/entrypoints/test_guided_processors.py | 4 +- tests/entrypoints/test_openai_server.py | 36 +++--- tests/kernels/test_moe.py | 3 +- tests/kernels/test_prefix_prefill.py | 3 +- tests/lora/test_layer_variation.py | 6 +- tests/lora/test_layers.py | 15 ++- tests/lora/test_llama.py | 47 ++++---- tests/lora/test_mixtral.py | 12 +- tests/metrics/test_metrics.py | 14 ++- tests/models/test_marlin.py | 15 +-- tests/prefix_caching/test_prefix_caching.py | 15 ++- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_sampler.py | 17 +-- tests/spec_decode/test_metrics.py | 6 +- tests/spec_decode/test_multi_step_worker.py | 3 +- tests/spec_decode/test_spec_decode_worker.py | 18 ++- vllm/config.py | 14 ++- vllm/core/block_manager.py | 15 ++- vllm/core/evictor.py | 6 +- vllm/core/scheduler.py | 8 +- vllm/engine/llm_engine.py | 27 +++-- vllm/engine/metrics.py | 22 ++-- vllm/entrypoints/api_server.py | 8 +- vllm/entrypoints/openai/api_server.py | 33 +++--- vllm/entrypoints/openai/serving_chat.py | 25 ++-- vllm/entrypoints/openai/serving_completion.py | 28 +++-- vllm/entrypoints/openai/serving_engine.py | 13 ++- vllm/lora/layers.py | 14 ++- vllm/lora/models.py | 3 +- vllm/lora/worker_manager.py | 7 +- vllm/model_executor/guided_decoding.py | 6 +- .../guided_logits_processors.py | 15 ++- .../layers/attention/attention.py | 4 +- .../layers/fused_moe/fused_moe.py | 107 ++++++++++++------ vllm/model_executor/layers/linear.py | 12 +- .../layers/quantization/__init__.py | 3 +- .../model_executor/layers/quantization/awq.py | 6 +- .../layers/quantization/gptq.py | 10 +- .../layers/quantization/marlin.py | 39 ++++--- .../layers/quantization/squeezellm.py | 3 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/models/baichuan.py | 3 +- vllm/model_executor/models/deepseek.py | 8 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/olmo.py | 19 ++-- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/models/stablelm.py | 13 ++- vllm/model_executor/models/starcoder2.py | 3 +- vllm/model_executor/neuron_model_loader.py | 3 +- .../parallel_utils/communication_op.py | 5 +- vllm/model_executor/sampling_metadata.py | 3 +- vllm/sampling_params.py | 4 +- vllm/sequence.py | 3 +- vllm/spec_decode/batch_expansion.py | 29 +++-- vllm/spec_decode/multi_step_worker.py | 14 ++- vllm/spec_decode/spec_decode_worker.py | 19 ++-- vllm/transformers_utils/configs/mpt.py | 89 +++------------ vllm/transformers_utils/configs/starcoder2.py | 72 ------------ .../transformers_utils/tokenizers/baichuan.py | 92 +++++++-------- vllm/utils.py | 12 +- vllm/worker/model_runner.py | 11 +- vllm/worker/neuron_worker.py | 6 +- 67 files changed, 557 insertions(+), 528 deletions(-) diff --git a/pyproject.toml 
b/pyproject.toml index c5db016cebdb7..d6fa5d7a035ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 + [tool.ruff.lint] select = [ # pycodestyle @@ -29,8 +33,6 @@ ignore = [ "F405", "F403", # lambda expression assignment "E731", - # line too long, handled by black formatting - "E501", # .strip() with multi-character strings "B005", # Loop control variable not used within loop body diff --git a/setup.py b/setup.py index 745b5a9b2d02a..023c3cde1910c 100644 --- a/setup.py +++ b/setup.py @@ -142,8 +142,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = (subprocess.check_output( + [command]).decode('utf-8').strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 32d110e0f0b47..e98bba8d43b49 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -73,7 +73,7 @@ def test_load_chat_template(): assert template_content is not None # Hard coded value for template_chatml.jinja assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 def test_no_load_chat_template(): @@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt) # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}") diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 04d01f7724e4f..b280fd1d73c2f 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,7 +4,8 @@ from vllm import SamplingParams from vllm.block import PhysicalTokenBlock -from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus +from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager, + AllocStatus) from vllm.utils import Device from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5b39269916f8b..4a0e3e759e25a 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -46,8 +46,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") def 
test_guided_logits_processors(): diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index f4a6e44d88a87..a5b2bf4c0f0c9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -5,9 +5,12 @@ import sys import pytest import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray import openai # use the official client for correctness check -from huggingface_hub import snapshot_download # downloading lora to test lora requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download # imports for guided decoding tests import json @@ -17,8 +20,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here -LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" TEST_SCHEMA = { "type": "object", @@ -59,8 +65,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") TEST_CHOICE = [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", @@ -120,8 +126,9 @@ def server(zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, + # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", # use half precision for speed and memory savings in CI environment + "bfloat16", "--max-model-len", "8192", "--enforce-eager", @@ -392,7 +399,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
use_beam_search=True), ) assert len(batch.choices) == 4 @@ -469,8 +477,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_guided_json_completion(server, client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, - prompt= - f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", n=3, temperature=1.0, max_tokens=500, @@ -489,9 +497,11 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI): "role": "system", "content": "you are a helpful assistant" }, { - "role": "user", - "content": "Give an example JSON for an employee profile that " + \ - f"fits this schema: {TEST_SCHEMA}" + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" }] chat_completion = await client.chat.completions.create( model=MODEL_NAME, diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c402fe3e98c7f..6165225d2d819 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -57,7 +57,8 @@ def test_fused_moe( [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): - "Make sure our Mixtral MoE implementation agrees with the one from huggingface." + """Make sure our Mixtral MoE implementation agrees with the one from + huggingface.""" # Instantiate our and huggingface's MoE blocks config = MixtralConfig() diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index e881cd1ec3753..a0be658acac7b 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -114,7 +114,8 @@ def test_contexted_kv_attention( v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() - # Warm up the Triton kernel by calling it once before actually measuring generation time + # Warm up the Triton kernel by calling it once before actually measuring + # generation time context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, b_start_loc, b_seq_len, b_ctx_len, max_input_len) torch.cuda.synchronize() diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 1a1da517b2276..95cf0cede8729 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -11,9 +11,9 @@ MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" PROMPTS = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. 
[/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. 
[/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501 ] diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 18ce300449dbf..46f054c5b84ef 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -17,14 +17,16 @@ LoRAMapping, BaseLayerWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights +from vllm.lora.models import (LoRALayerWeights, convert_mapping, + PackedLoRALayerWeights) from vllm.config import LoRAConfig from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear, QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -258,7 +260,8 @@ def create_random_embedding_layer(): @torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: @@ -674,9 +677,9 @@ class FakeConfig: result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index dfaf8c700695a..130906c3d584d 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -10,12 +10,12 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the 
elector what is under nationality? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size): tensor_parallel_size=tp_size) expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] print("lora adapter created") @@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files): def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" @ray.remote(num_gpus=1) def get_num_gpu_blocks_lora(): @@ -132,13 +133,15 @@ def get_num_gpu_blocks_lora(): @ray.remote(num_gpus=1) def get_num_gpu_blocks_no_lora(): llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) return num_gpu_blocks_no_lora_warmup num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) num_gpu_blocks_no_lora_warmup = ray.get( get_num_gpu_blocks_no_lora.remote()) assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " "less 
when using lora than when not using lora") diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index e45fb92ab7edf..4d74722aaa926 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -9,9 +9,9 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. 
[/user] [assistant]", # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -42,9 +42,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): worker_use_ray=True) expected_lora_output = [ - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] assert do_sample(llm, mixtral_lora_files, diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 410bdfa5c69e2..0ab9c63ce4377 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. + # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( "prompts of different lengths are required") @@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" - ) + f"prompt token count: {vllm_prompt_token_count!r}\n" + f"metric: {metric_count!r}") @pytest.mark.parametrize("model", MODELS) @@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" - ) + f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index f3cc517364f06..a3a1487e62e05 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -1,7 +1,7 @@ """Compare the outputs of a GPTQ model to a Marlin model. -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the +Note: GPTQ and Marlin do not have bitwise correctness. 
+As a result, in this test, we just confirm that the top selected tokens of the Marlin/GPTQ models are in the top 3 selections of each other. Note: Marlin internally uses locks to synchronize the threads. This can @@ -14,7 +14,8 @@ import pytest import torch from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] @@ -87,11 +88,11 @@ def test_models( if marlin_output_id != gptq_output_id: # Each predicted token must be in top 5 of the other's assert gptq_output_id in marlin_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") assert marlin_output_id in gptq_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") # Break out since sequences will now diverge. break diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7ef8dde7bb8f6..c83551c36ef10 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -20,20 +20,23 @@ def test_block_allocator( num_blocks, enable_caching=True) - # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + # Allocate two PysicalTokenBlocks with the same hash and check + # that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) - # Free the first_block and confirm that the ref_count is correctly decremented on the second block + # Free the first_block and confirm that the ref_count is correctly + # decremented on the second block block_allocator.free(first_block) assert (second_block.ref_count == 1) # Free the second block block_allocator.free(second_block) - # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + # Reallocate the first block and confirm that, even after the block + # had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ): for block in blocks: block_allocator.free(block) - # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + # Allocate a new block and confirm that it's the first block freed. 
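The prefix-caching tests above spell out the allocator contract: allocating the same content hash twice returns the same physical block with an incremented ref count, a block whose ref count reaches zero stays cached until space is needed, and eviction reclaims the least recently freed block. A rough, self-contained sketch of that contract is below; the `ToyCachedAllocator` and `Block` names are made up for illustration and are not vLLM's actual allocator or evictor classes.

```python
from collections import OrderedDict
from dataclasses import dataclass
from typing import Dict


@dataclass
class Block:
    block_hash: int
    ref_count: int = 0


class ToyCachedAllocator:
    """Same hash -> same block; freed blocks wait in an LRU list before eviction."""

    def __init__(self, num_blocks: int) -> None:
        self.num_blocks = num_blocks
        self.cached: Dict[int, Block] = {}
        # Freed-but-not-evicted blocks, oldest first (LRU eviction order).
        self.free_list: "OrderedDict[int, Block]" = OrderedDict()

    def allocate(self, block_hash: int) -> Block:
        if block_hash in self.free_list:
            # Bring back a freed-but-not-evicted block (its ref_count was 0).
            block = self.free_list.pop(block_hash)
        elif block_hash in self.cached:
            block = self.cached[block_hash]
        elif len(self.cached) >= self.num_blocks:
            # Evict the least recently freed block and reuse it under a new
            # hash (assumes at least one block has been freed).
            _, block = self.free_list.popitem(last=False)
            del self.cached[block.block_hash]
            block.block_hash = block_hash
        else:
            block = Block(block_hash)
        self.cached[block_hash] = block
        block.ref_count += 1
        return block

    def free(self, block: Block) -> None:
        block.ref_count -= 1
        if block.ref_count == 0:
            self.free_list[block.block_hash] = block


# Mirrors the behavior the test above checks.
alloc = ToyCachedAllocator(num_blocks=16)
a = alloc.allocate(block_hash=123)
b = alloc.allocate(block_hash=123)
assert a is b and b.ref_count == 2
alloc.free(a)
alloc.free(b)
assert alloc.allocate(123) is a  # still cached after ref_count hit zero
```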
+ # I.E The Least Recently Used block new_block_hash = block_size new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) @@ -68,7 +72,8 @@ def test_eviction(num_blocks: int, ): assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) - # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + # Allocate a new block and confirm that it's not the realloc_block, + # since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1abb55f021214..14f1872c45258 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -70,8 +70,8 @@ def test_get_prompt_logprobs( hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) - assert isinstance(sample_logprob.decoded_token, str), \ - ("The token should be decoded by the time it is returned " + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is returned " " to the user.") diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 31e865f42ff3b..1bc8703d1a8e0 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -255,9 +255,10 @@ def test_sampling(model_runner: ModelRunner): if metadata.sampling_params.use_beam_search: continue - if metadata.sampling_params.seed is not None \ - and expected_tokens[i] is None: - # Record seeded random result to compare with results of second invocation + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation expected_tokens[i] = [ nth_output.output_token for nth_output in sequence_output.samples @@ -265,11 +266,13 @@ def test_sampling(model_runner: ModelRunner): continue for n, nth_output in enumerate(sequence_output.samples): - if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed assert nth_output.output_token == expected_tokens[i][n] else: - # For non-seeded random check that one of the high-logit tokens were chosen + # For non-seeded random check that one of the high-logit + # tokens were chosen assert nth_output.output_token in expected_tokens[i] # Test batch @@ -284,8 +287,8 @@ def test_sampling(model_runner: ModelRunner): input_tensor.data = input_tensor.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index) - # This time, results of seeded random samples will be compared with the corresponding - # sample in the pre-shuffled batch + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch test_sampling(model_runner) del model_runner diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 941ea37aa81e0..09847136d13e9 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -150,8 +150,10 @@ def test_initial_metrics_has_correct_values(has_data: bool): assert metrics.emitted_tokens == num_emitted_tokens if has_data: - assert metrics.draft_acceptance_rate == num_accepted_tokens / 
num_draft_tokens - assert metrics.system_efficiency == num_emitted_tokens / num_possible_tokens + assert (metrics.draft_acceptance_rate == num_accepted_tokens / + num_draft_tokens) + assert (metrics.system_efficiency == num_emitted_tokens / + num_possible_tokens) else: assert math.isnan(metrics.draft_acceptance_rate) assert math.isnan(metrics.system_efficiency) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 88bb7c293fe95..45b43ec59ee8f 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,7 +3,8 @@ import pytest from unittest.mock import MagicMock -from vllm.spec_decode.multi_step_worker import MultiStepWorker, DraftModelTop1Proposer +from vllm.spec_decode.multi_step_worker import (MultiStepWorker, + DraftModelTop1Proposer) from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplerOutput diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e919711c3ed2c..bfc69e01e3eb9 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -4,12 +4,15 @@ from unittest.mock import MagicMock from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker, split_num_cache_blocks_evenly +from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, + split_num_cache_blocks_evenly) from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.model_executor.utils import set_random_seed from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from .utils import mock_worker, create_batch, ExecuteModelData, create_sampler_output_list -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics, AsyncMetricsCollector +from .utils import (mock_worker, create_batch, ExecuteModelData, + create_sampler_output_list) +from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, + AsyncMetricsCollector) @pytest.mark.parametrize('k', [1, 2, 6]) @@ -391,13 +394,15 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): mock_rejsample_metrics = MagicMock( spec=SpecDecodeWorkerMetrics) if returns_metrics else None - metrics_collector.maybe_collect_rejsample_metrics.return_value = mock_rejsample_metrics + metrics_collector.maybe_collect_rejsample_metrics.return_value = ( + mock_rejsample_metrics) output = worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics - call_args_list = metrics_collector.maybe_collect_rejsample_metrics.call_args_list + call_args_list = ( + metrics_collector.maybe_collect_rejsample_metrics.call_args_list) assert len(call_args_list) == 1 args, kwargs = call_args_list[0] assert args[0] == k or kwargs.get('k', -1) == k @@ -547,7 +552,8 @@ def test_profile_num_available_blocks(available_gpu_blocks: int, target_worker.profile_num_available_blocks.return_value = ( available_gpu_blocks, available_cpu_blocks) - target_worker.get_cache_block_size_bytes.return_value = target_cache_block_size_bytes + target_worker.get_cache_block_size_bytes.return_value = ( + target_cache_block_size_bytes) draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, diff --git a/vllm/config.py b/vllm/config.py index 
ef9a920f29c2a..e893fe702c975 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -45,7 +45,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use @@ -189,8 +189,8 @@ def _verify_quantization(self) -> None: if is_hip( ) and self.quantization in rocm_not_supported_quantization: raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization != "marlin": logger.warning( f"{self.quantization} quantization is not fully " @@ -321,7 +321,8 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: @@ -399,8 +400,9 @@ def __init__( ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): - # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. - # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # For Neuron device support, here we assign TP=1 to avoid sharding + # within vLLM directly. Transformer-neuronx would take + # neuron_tp_degree attribute, and distribute the workload # to multiple NeuronCores. self.tensor_parallel_size = 1 self.neuron_tp_degree = tensor_parallel_size diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 52b120f227eda..8bfc14999f0a7 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -95,13 +95,15 @@ def free(self, block: PhysicalTokenBlock) -> None: del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + return (self.num_blocks - self.current_num_blocks + + self.evictor.num_blocks) def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - # If caching is enabled, update the hash of block and the cached_blocks dictionary. + # If caching is enabled, update the hash of block and the + # cached_blocks dictionary. 
if self.enable_caching: assert not self.contains_block(block_hash) old_hash = block.block_hash @@ -218,10 +220,12 @@ def _promote_last_block( seq: Sequence, last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: - # Compute a new hash for the block so that it can be shared by other Sequences + # Compute a new hash for the block so that it can be shared by + # other Sequences new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) - # if new_hash is already in the cached table, then free last_block and return the cached version + # if new_hash is already in the cached table, then free last_block + # and return the cached version if self.gpu_allocator.contains_block(new_hash): self.gpu_allocator.free(last_block) return self.gpu_allocator.allocate(new_hash) @@ -289,7 +293,8 @@ def append_slot( assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared + # If the last block is now complete, promote it to a full block so + # that it can be shared new_block = self._maybe_promote_last_block(seq, last_block) block_table[-1] = new_block return None diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index b538ea574b604..1d81f5a97d71c 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -39,9 +39,9 @@ def add(self, block: PhysicalTokenBlock): @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is contained - in the evictor before calling remove. Should be used to "bring back" blocks - that have been freed but not evicted yet. + evictor. Caller is responsible for making sure that block_hash is + contained in the evictor before calling remove. Should be used to + "bring back" blocks that have been freed but not evicted yet. """ pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c96c6d62ef19d..9255f91be55cb 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -214,8 +214,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_waiting_sequences.appendleft(seq_group) @@ -309,8 +309,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. 
leftover_swapped.appendleft(seq_group) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5b46d9db5649a..6e045cd6d73c6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -100,7 +100,8 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"disable_custom_all_reduce=" + f"{parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " @@ -929,7 +930,8 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.metrics.last_token_time) + # Time since last token. + # (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. if seq_group.is_finished(): @@ -961,16 +963,17 @@ def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, for token_id, sample_logprob in logprobs.items(): if (sample_logprob.decoded_token is None and token_id != -1): all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] - _, new_text, prefix_offset, read_offset = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) + (_, new_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. + spaces_between_special_tokens, + ) sample_logprob.decoded_token = new_text def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d31542159e4a4..17b1852f5b0a3 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics +from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, + disable_created_metrics) import time import numpy as np @@ -177,10 +178,12 @@ def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on the vLLM side. - # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. - # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + # Support legacy gauge metrics that make throughput calculations on + # the vLLM side. Moving forward, we should use counters like + # counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the + # grafana/prometheus side. 
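The comment above distinguishes the legacy gauges, where vLLM computes throughput itself, from plain counters whose rate is derived later with PromQL's `rate()`. A minimal sketch of the counter style with `prometheus_client` follows; the metric and label names are invented for the example and do not match vLLM's real metric names.

```python
from prometheus_client import Counter

# Export only the raw token count; a dashboard derives throughput with
# e.g. rate(example_prompt_tokens_total[1m]) instead of trusting a
# client-side tokens/s gauge.
counter_prompt_tokens = Counter(
    "example_prompt_tokens_total",
    "Cumulative number of prefill tokens processed.",
    labelnames=["model_name"],
)


def record_iteration(model_name: str, num_prompt_tokens: int) -> None:
    counter_prompt_tokens.labels(model_name=model_name).inc(num_prompt_tokens)
```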
See + # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 self.metrics.gauge_avg_prompt_throughput.labels( **self.labels).set(prompt_throughput) self.metrics.gauge_avg_generation_throughput.labels( @@ -188,7 +191,7 @@ def _log_prometheus_interval(self, prompt_throughput: float, def log(self, stats: Stats) -> None: """Called by LLMEngine. - Logs to prometheus and tracked stats every iteration. + Logs to prometheus and tracked stats every iteration. Logs to Stdout every self.local_interval seconds.""" # Log to prometheus. @@ -200,8 +203,8 @@ def log(self, stats: Stats) -> None: # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): - - # Compute summary metrics for tracked stats (and log them to promethus if applicable). + # Compute summary metrics for tracked stats (and log them + # to promethus if applicable). prompt_throughput = self._get_throughput(self.num_prompt_tokens, now=stats.now) generation_throughput = self._get_throughput( @@ -213,7 +216,8 @@ def log(self, stats: Stats) -> None: # Log to stdout. logger.info( f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Avg generation throughput: " + f"{generation_throughput:.1f} tokens/s, " f"Running: {stats.num_running} reqs, " f"Swapped: {stats.num_swapped} reqs, " f"Pending: {stats.num_waiting} reqs, " diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1eb4ab8b06b64..86b6c4c67cfa4 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,7 +1,9 @@ """ -NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. -It is not intended for production use. For production use, we recommend using our OpenAI compatible server. -We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. """ import argparse diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9f29b4ac92f48..00407bc0e809c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -18,7 +18,9 @@ import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse) from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -84,13 +86,11 @@ def parse_args(): type=json.loads, default=["*"], help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." 
- ) + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") parser.add_argument("--served-model-name", type=str, default=None, @@ -103,9 +103,8 @@ def parse_args(): default=None, nargs='+', action=LoRAParserAction, - help= - "LoRA module configurations in the format name=path. Multiple modules can be specified." - ) + help="LoRA module configurations in the format name=path. " + "Multiple modules can be specified.") parser.add_argument("--chat-template", type=str, default=None, @@ -138,9 +137,10 @@ def parse_args(): help="Additional ASGI middleware to apply to the app. " "We accept multiple --middleware arguments. " "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). " - ) + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). ") parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() @@ -235,9 +235,8 @@ async def authentication(request: Request, call_next): elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." - ) + raise ValueError(f"Invalid middleware {middleware}. " + f"Must be a function or a class.") logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7d5603c85e4e9..d2fb9ca001b15 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,7 +12,8 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -37,8 +38,9 @@ async def create_chat_completion( ChatCompletionResponse]: """Completion API similar to OpenAI's API. - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + ChatCompletion API. NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) @@ -116,7 +118,8 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). 
if first_iteration: - # Send first response for each request.n (index) with the role + # Send first response for each request.n (index) with + # the role role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -133,7 +136,8 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" - # Send response to echo the input portion of the last message + # Send response to echo the input portion of the + # last message if request.echo: last_msg_content = "" if request.messages and isinstance( @@ -145,11 +149,12 @@ async def chat_completion_stream_generator( if last_msg_content: for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage( - content=last_msg_content), - finish_reason=None) + choice_data = ( + ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c673b2582c47b..b78f053800f3c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,8 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional, + Dict, Tuple) from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -16,7 +17,8 @@ ) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -44,9 +46,8 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: prompt_is_tokens = True prompts = prompt # case 4: array of token arrays else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" - ) + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") return prompt_is_tokens, prompts @@ -156,7 +157,8 @@ async def create_completion(self, request: CompletionRequest, int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. + # results. In addition, we do not stream the results when use + # beam search. stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) @@ -223,7 +225,8 @@ async def completion_stream_generator( for output in res.outputs: i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. + # TODO(simon): optimize the performance by avoiding full + # text O(n^2) sending. 
if request.echo and request.max_tokens == 0: # only return the prompt @@ -231,11 +234,12 @@ async def completion_stream_generator( delta_token_ids = res.prompt_token_ids top_logprobs = res.prompt_logprobs has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[ - i]: + elif (request.echo and request.max_tokens > 0 + and not has_echoed[i]): # echo the prompt and first token delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids + delta_token_ids = (res.prompt_token_ids + + output.token_ids) top_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True @@ -248,7 +252,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 230d13d97dbba..2db884945c491 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -50,10 +50,12 @@ def __init__(self, except RuntimeError: event_loop = None - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop + if event_loop is not None and event_loop.is_running(): + # If the current is instanced by Ray Serve, + # there is already a running event loop event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray + else: + # When using single vLLM without engine_use_ray asyncio.run(self._post_init()) async def _post_init(self): @@ -178,8 +180,9 @@ def _validate_prompt_and_tokenize( if token_num + request.max_tokens > self.max_model_len: raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{request.max_tokens + token_num} tokens " f"({token_num} in the messages, " f"{request.max_tokens} in the completion). 
" f"Please reduce the length of the messages or completion.", ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e667d70f71e39..99e6cdeee6364 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -20,10 +20,12 @@ RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + split_tensor_along_last_dim) if TYPE_CHECKING: pass @@ -84,7 +86,8 @@ def _apply_lora_packed_nslice( lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices """ org_output = output x = x.view(-1, x.shape[-1]) @@ -819,9 +822,8 @@ def create_lora_weights( ) -> None: # Keep this in sync with csrc/punica/bgmv/bgmv_config.h if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) + raise ValueError("When using LoRA, vocab size must be " + "32000 >= vocab_size <= 33024") self.lora_a_stacked = torch.zeros( ( max_loras, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7386d21c58e4e..238da256b7cdc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,8 @@ from vllm.config import LoRAConfig from vllm.utils import LRUCache, in_wsl -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer, + from_layer_sampler) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7e92bc93ab472..911115d63a639 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,10 +154,9 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: f"LoRA rank {lora.rank} is greater than max_lora_rank " f"{self.lora_config.max_lora_rank}.") if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) + raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}.") return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py index a8573f8bdc6c8..00984460d79a6 100644 --- a/vllm/model_executor/guided_decoding.py +++ b/vllm/model_executor/guided_decoding.py @@ -8,8 +8,10 @@ from typing import Union, Tuple from pydantic import BaseModel -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest -from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) class GuidedDecodingMode(Enum): diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py index 1b3e5e71a5911..76d41aa37dd7b 100644 --- a/vllm/model_executor/guided_logits_processors.py +++ b/vllm/model_executor/guided_logits_processors.py @@ -107,12 +107,15 @@ def __init__(self, Parameters ---------- schema - A JSON schema that encodes the structure we want the model to generate + A JSON schema that encodes the structure we want the model to + generate tokenizer The model's tokenizer whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + Pattern to use for JSON syntactic whitespace (doesn't impact + string literals) + Example: allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) @@ -122,8 +125,8 @@ def __init__(self, schema_str = schema else: raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " + - "a Pydantic object, a dictionary or a string that contains the JSON " - + "Schema specification") + f"Cannot parse schema {schema}. 
The schema must be either " + f"a Pydantic object, a dictionary or a string that contains " + f"the JSON Schema specification") regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string, tokenizer) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 724dd0511c5aa..4b63b9eaf59a7 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -35,12 +35,12 @@ def __init__( ) -> None: super().__init__() if _use_flash_attn(): - from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend + from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 08e3c2d5b706e..3e6dd0dfe2eb3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,9 +30,10 @@ def fused_moe_kernel( K, EM, num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). stride_am, stride_ak, stride_be, @@ -50,17 +51,30 @@ def fused_moe_kernel( compute_type: tl.constexpr, ): """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. - This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. 
+ - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. @@ -105,7 +119,8 @@ def fused_moe_kernel( accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. + # Load the next block of A and B, generate a mask by checking the + # K dimension. a = tl.load(a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), @@ -139,30 +154,41 @@ def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. + This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. 
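The `moe_align_block_size` docstring above, together with the worked example that follows it, can be rehearsed in plain Python. The sketch below reproduces the padding-and-sorting idea on the documented toy input; `toy_moe_align` is an illustrative helper, not the actual kernel-backed implementation, and it only visits experts that appear in the input.

```python
from typing import List, Tuple


def toy_moe_align(topk_ids: List[List[int]],
                  block_size: int) -> Tuple[List[int], List[int], int]:
    """Pure-Python rehearsal of the alignment described above (no Triton)."""
    flat = [e for row in topk_ids for e in row]
    pad_id = len(flat)  # sentinel index for padding slots
    sorted_token_ids: List[int] = []
    expert_ids: List[int] = []
    for expert in sorted(set(flat)):
        token_positions = [i for i, e in enumerate(flat) if e == expert]
        # Pad this expert's token list up to a multiple of block_size.
        while len(token_positions) % block_size != 0:
            token_positions.append(pad_id)
        sorted_token_ids.extend(token_positions)
        expert_ids.extend([expert] * (len(token_positions) // block_size))
    return sorted_token_ids, expert_ids, len(sorted_token_ids)


ids, experts, total = toy_moe_align(
    [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size=4)
assert ids == [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]
assert total == 16
```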
Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. - As block_size is 4, we pad 1 token for each expert. - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. """ sorted_ids = torch.empty( (topk_ids.numel() + num_experts * (block_size - 1), ), @@ -224,13 +250,14 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. - The return value will be a dictionary that maps an irregular grid of batch sizes - to configurations of the fused_moe kernel. To evaluate the kernel on a given batch - size bs, the closest batch size in the grid should be picked and the associated - configuration chosen to invoke the kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. """ - # First look up if an optimized configuration is available in the configs directory + # First look up if an optimized configuration is available in the configs + # directory device_name = torch.cuda.get_device_name().replace(" ", "_") config_file_path = os.path.join( @@ -243,7 +270,8 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} - # If no optimized configuration is available, we will use the default configuration + # If no optimized configuration is available, we will use the default + # configuration return None @@ -258,18 +286,22 @@ def fused_moe( override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + Parameters: - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). 
- topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ @@ -325,7 +357,8 @@ def fused_moe( configs = get_moe_configs(E, w2.shape[2]) if configs: - # If an optimal configuration map has been found, look up the optimal config + # If an optimal configuration map has been found, look up the + # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b2396a1d6f141..60f6fc83b200f 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -285,7 +285,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -307,7 +308,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -413,7 +415,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -442,7 +445,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. 
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dc54641878c64..af27b1844cea4 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,6 +1,7 @@ from typing import Type -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233c..2caef5f1ebf50 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class AWQConfig(QuantizationConfig): @@ -50,7 +51,8 @@ def get_min_capability(self) -> int: def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", ] @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 2e6aabb232673..bb69c7235a133 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -31,8 +31,8 @@ def __init__( self.pack_factor = Fraction(32, self.weight_bits) if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 2/3/4/8-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " @@ -101,7 +101,8 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + if (output_size_per_partition % self.quant_config.pack_factor.numerator + != 0): raise ValueError( "The output size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -114,7 +115,8 @@ def create_weights( exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: + if (input_size != input_size_per_partition + and self.quant_config.group_size != -1): # For act-order models, we cannot use Exllama for row parallel layer if self.quant_config.desc_act: exllama_state = ExllamaState.UNUSED diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 7566d78a8aba4..0c4f20d9e3a58 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -5,7 +5,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class MarlinConfig(QuantizationConfig): @@ -22,8 +23,9 @@ def __init__( self.group_size = group_size if self.group_size != 128 and self.group_size != -1: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) is supported for " - f"Marlin, but got group_size of {self.group_size}") + "Currently, only group size 128 and -1 (channelwise) " + "is supported for Marlin, but got group_size of " + f"{self.group_size}") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // 4 @@ -37,7 +39,8 @@ def __init__( # Min in_features dim self.min_k_threads = 128 - # Max parallel problems to solve at once (improves large batch performance) + # Max parallel problems to solve at once (improves large + # batch performance) self.max_parallel = 16 # Permutation length used by the marlin kernels. @@ -102,22 +105,26 @@ def create_weights( # Validate output_size_per_partition if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}.") if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}.") # Validate input_size_per_partition if input_size_per_partition % self.quant_config.min_k_threads != 0: raise ValueError( - f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." - ) - if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." 
- ) + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}.") + if (self.quant_config.group_size != -1 and + input_size_per_partition % self.quant_config.group_size != 0): + raise ValueError(f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}.") # Check that we have at least 4 tiles horizontally in the shard num_tiles_per_perm = self.quant_config.perm_len // ( @@ -149,7 +156,9 @@ def create_weights( ) # Determine if channelwise or not - input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + input_groups = (1 if self.quant_config.group_size == -1 else + input_size_per_partition // + self.quant_config.group_size) scales = Parameter( torch.empty( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e88552756..ed25455e6ec1f 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.utils import is_hip diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 19e7f630c4620..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -6,7 +6,8 @@ from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 6da0082b94285..cbf472750e294 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -333,7 +333,8 @@ def load_weights(self, if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: + # Unlike Baichuan, Baichuan2 normalizes the head weights. + # Refer to: # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 # Distinguish between Baichuan and Baichuan2 by checking the # vocab size. 
This is suggested by diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index f2dca3df27cfb..13c080cb02774 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -119,7 +119,8 @@ def __init__( linear_method=None) if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) self.shared_experts = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -273,8 +274,9 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekMoE(config=config, linear_method=linear_method) else: self.mlp = DeepseekMLP( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index b8c6822e9825e..93dce7b67a7a5 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -143,7 +143,8 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + inner_dim = (4 * config.n_embd + if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 0ae0a85643456..7b2215ef4bda5 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -305,7 +305,8 @@ def load_weights(self, param = params_dict[name] if "wqkv" in name: config = self.config - kv_groups = config.num_attention_heads // config.num_key_value_heads + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) head_dim = config.hidden_size // config.num_attention_heads loaded_weight = loaded_weight.view(-1, 2 + kv_groups, head_dim, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index fa7a6d850051e..2b0a420e82faf 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -52,7 +52,8 @@ ) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,7 +82,8 @@ def output_multiplier(self) -> float: class OlmoAttention(nn.Module): """ - This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
""" @@ -94,11 +96,12 @@ def __init__( self.config = config self.hidden_size = config.d_model assert config.d_model % config.n_heads == 0 - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) self.total_num_heads = self.config.n_heads assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) self.head_dim = self.hidden_size // self.total_num_heads # Layer norms. @@ -158,7 +161,8 @@ def forward( class OlmoMLP(nn.Module): """ - This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the MLP block where the output is computed as + ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ @@ -217,7 +221,8 @@ def forward( class OlmoBlock(nn.Module): """ - This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 4dd63f923e5f2..3e4f843e649b4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -170,7 +170,8 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + use_sliding_window = (config.use_sliding_window + and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index d1a547f815616..c66f327beee7a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,5 +1,6 @@ # coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. +# All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +17,8 @@ # This code is based off the following work: # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" +"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) +model compatible with HuggingFace weights.""" from typing import List, Optional, Tuple import torch @@ -102,9 +104,9 @@ def __init__(self, self.kv_size = self.num_key_value_heads * self.head_dim self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + raise ValueError(f"hidden_size must be divisible by num_heads " + f"(got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") self.qkv_proj = QKVParallelLinear(self.hidden_size, self.head_dim, @@ -192,7 +194,6 @@ def __init__(self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None) -> None: super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index efa235233372f..cfbb1bdb7909e 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -35,7 +35,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py index b8d63d4ff12fc..c434b270a5562 100644 --- a/vllm/model_executor/neuron_model_loader.py +++ b/vllm/model_executor/neuron_model_loader.py @@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: - from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + from transformers_neuronx.config import (NeuronConfig, + ContinuousBatchingConfig) parallel_config = kwargs.get("parallel_config") scheduler_config = kwargs.get("scheduler_config") diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index cf805df892fdc..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -11,7 +11,8 @@ get_tensor_model_parallel_group, is_cupy_nccl_enabled_for_all_reduce, ) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce +from vllm.model_executor.parallel_utils.custom_all_reduce import ( + custom_all_reduce) def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: @@ -24,7 +25,7 @@ def 
tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: and GPU topology. TLDR: always assume this function modifies its input, but use the return - value as the output. + value as the output. """ # Bypass the function if we are using only 1 GPU. if get_tensor_model_parallel_world_size() == 1: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7deb80801856e..b23f0170a6ca5 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -114,7 +114,8 @@ def from_sampling_metadata( do_penalties = True if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs + # For tokens in the prompt that we only need to get + # their logprobs prompt_len = sampling_metadata.prompt_lens[i] temperatures += [temperature] * (prompt_len - 1) top_ps += [top_p] * (prompt_len - 1) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24bf..4aa158878fb96 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -74,8 +74,8 @@ class SamplingParams: stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - include_stop_str_in_output: Whether to include the stop strings in output - text. Defaults to False. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. max_tokens: Maximum number of tokens to generate per output sequence. diff --git a/vllm/sequence.py b/vllm/sequence.py index 37c102407a5f2..4a002edaf580f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -351,7 +351,8 @@ def maybe_set_first_token_time(self, time: float) -> None: self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request level timings.""" + """Sets the first scheduled time and time in queue for Request + level timings.""" if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 478c950f52873..0f698fa346010 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -5,8 +5,12 @@ from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) from vllm.worker.worker import Worker -from vllm.spec_decode.util import nvtx_range, sampler_output_to_torch, get_all_seq_ids, split_batch_by_proposal_len -from vllm.spec_decode.interfaces import SpeculativeScorer, SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, + get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import (SpeculativeScorer, + SpeculativeProposals, + SpeculativeScores) SeqId = int TargetSeqId = int @@ -68,11 +72,12 @@ def score_proposals( proposal_lens_list = proposals.proposal_lens.tolist() proposal_token_ids_list = proposals.proposal_token_ids.tolist() - spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens = self._expand_batch( - seq_group_metadata_list=seq_group_metadata_list, - 
proposal_token_ids_list=proposal_token_ids_list, - proposal_lens_list=proposal_lens_list, - ) + (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) target_sampler_output = self._scorer_worker.execute_model( seq_group_metadata_list=target_seq_group_metadata_list, @@ -125,7 +130,8 @@ def _expand_batch( num_scoring_tokens = len(target_seq_group_metadata_list) target_seq_group_metadata_list.extend(non_spec_seqs) - return spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens + return (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) def _contract_batch(self, original_bs: int, target_sampler_output: List[SamplerOutput], @@ -306,10 +312,11 @@ def _split_scoring_output( # Convert non-speculative output tokens to tensors. sampler_output.sampled_token_probs = non_spec_probs sampler_output.sampled_token_ids = non_spec_sampled_tokens - non_spec_target_token_ids, non_spec_target_probs = sampler_output_to_torch( - [sampler_output]) + non_spec_target_token_ids, non_spec_target_probs = ( + sampler_output_to_torch([sampler_output])) - return target_token_ids, target_probs, non_spec_target_token_ids, non_spec_target_probs + return (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) def _create_target_seq_id_iterator( self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f7be14d3d22c2..0915c275b0408 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -5,7 +5,8 @@ from vllm.sequence import SamplerOutput, SequenceGroupMetadata from vllm.worker.worker import Worker -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeProposer +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) from vllm.spec_decode.util import sampler_output_to_torch @@ -247,8 +248,9 @@ def get_proposals( """ # Split speculative- and non-speculative- sequences. 
- proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices = self._split_by_max_model_len( - seq_group_metadata_list, max_proposal_len) + (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) if nonzero_proposal_len_seqs: # Speculate tokens using the draft worker for the speculative @@ -306,7 +308,8 @@ def _split_by_max_model_len( else: proposal_lens.append(0) - return proposal_lens, nonzero_proposal_len_seqs, nonzero_proposal_len_indices + return (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) def _merge_outputs( self, @@ -356,7 +359,8 @@ def _merge_outputs( device=self._device) entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs - proposal_tokens, proposal_probs = entire_proposal_tokens, entire_proposal_probs + proposal_tokens, proposal_probs = (entire_proposal_tokens, + entire_proposal_probs) proposal_lens = torch.zeros(batch_size, dtype=torch.long, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 890e479202372..1e56741347008 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -10,7 +10,8 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.config import CacheConfig -from vllm.spec_decode.util import nvtx_range, get_all_seq_ids, split_batch_by_proposal_len +from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids, + split_batch_by_proposal_len) from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import SpeculativeScorer @@ -25,7 +26,7 @@ class SpecDecodeWorker: LLM, after which some verification routine determines which (if any) of the speculative tokens are accepted by the larger LLM. - See https://github.com/vllm-project/vllm/pull/2188 and + See https://github.com/vllm-project/vllm/pull/2188 and https://github.com/vllm-project/vllm/pull/3103 for more info. 
The current implementation has the following limitations: @@ -109,10 +110,12 @@ def profile_num_available_blocks(self, block_size: int, block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype)) - scorer_cache_block_size_bytes = self.scorer_worker.get_cache_block_size_bytes( - block_size, cache_dtype) - proposer_cache_block_size_bytes = self.proposer_worker.get_cache_block_size_bytes( - block_size, cache_dtype) + scorer_cache_block_size_bytes = ( + self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + proposer_cache_block_size_bytes = ( + self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) new_num_gpu_blocks = split_num_cache_blocks_evenly( scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, @@ -320,8 +323,8 @@ def _create_output_sampler_list( sampler_output_list.append( SamplerOutput(outputs=step_output_token_ids)) - maybe_rejsample_metrics = self._metrics.maybe_collect_rejsample_metrics( - k) + maybe_rejsample_metrics = ( + self._metrics.maybe_collect_rejsample_metrics(k)) if maybe_rejsample_metrics is not None: sampler_output_list[ 0].spec_decode_worker_metrics = maybe_rejsample_metrics diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 5ea0d9122ef11..2c0e45623aa25 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,62 +62,6 @@ def __init__(self, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. 
- alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. - norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ self.d_model = d_model self.n_heads = n_heads self.n_layers = n_layers @@ -139,8 +83,8 @@ def __init__(self, self.fc_type = fc_type if verbose is not None: warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - ), + 'verbose argument for MPTConfig is now ignored and ' + 'will be removed. 
Use python_log_level instead.'), stacklevel=2) if 'name' in kwargs: del kwargs['name'] @@ -149,7 +93,8 @@ def __init__(self, if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + f'alibi is turned on, setting `learned_pos_emb` ' + f'to {self.learned_pos_emb}`', stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -176,8 +121,8 @@ def _validate_config(self) -> None: [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] )): raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " + "probabilities and must be between 0 and 1") if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: raise ValueError( f"Unknown attn_impl={self.attn_config['attn_impl']}") @@ -193,17 +138,17 @@ def _validate_config(self) -> None: if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ 'attn_impl'] not in ['torch', 'triton']: raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) + 'attn_uses_sequence_id only implemented with torch ' + 'and triton attention.') if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' # pylint: disable=line-too-long - ) + 'model.embedding_fraction must be between 0 (exclusive) ' + 'and 1 (inclusive)!') if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) + f"self.logit_scale={self.logit_scale!r} is not recognized as " + "an option; use numeric value or 'inv_sqrt_d_model'.") if self.init_config.get('name', None) is None: raise ValueError( f"self.init_config={self.init_config!r} 'name' needs to be set." @@ -219,11 +164,11 @@ def _validate_config(self) -> None: del te except Exception as exc: raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + + 'TransformerEngine import fail. `fc_type: te` requires ' + 'TransformerEngine be installed. ' + 'The required version of transformer_engine also requires ' + 'FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' ) from exc if self.ffn_config['ffn_type'] == 'mptmlp': diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py index 4c3b6b8def074..2879cd0445275 100644 --- a/vllm/transformers_utils/configs/starcoder2.py +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -2,78 +2,6 @@ class Starcoder2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a - Starcoder2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration - with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. - - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 49152): - Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Starcoder2Model`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 12288): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 30): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 24): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 2): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_epsilon (`float`, *optional*, defaults to 1e-05): - Epsilon value for the layer norm - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - bos_token_id (`int`, *optional*, defaults to 50256): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 50256): - The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `None` (no sliding window). - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - residual_dropout (`float`, *optional*, defaults to 0.0): - Residual connection dropout value. - embedding_dropout (`float`, *optional*, defaults to 0.0): - Embedding dropout. - use_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias term on linear layers of the model. 
- - - ```python - >>> from transformers import Starcoder2Model, Starcoder2Config - - >>> # Initializing a Starcoder2 7B style configuration - >>> configuration = Starcoder2Config() - - >>> # Initializing a model from the Starcoder2 7B style configuration - >>> model = Starcoder2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "starcoder2" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py index 1dd241e4a5c4b..02045bdcb2ccf 100644 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ b/vllm/transformers_utils/tokenizers/baichuan.py @@ -1,4 +1,3 @@ -# yapf: disable # Adapted from # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # This includes a fix suggested in @@ -13,7 +12,6 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} @@ -52,27 +50,16 @@ def __init__( clean_up_tokenization_spaces=False, **kwargs, ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) + self.sp_model_kwargs = ({} if sp_model_kwargs is None else + sp_model_kwargs) + bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) else bos_token) + eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) else eos_token) + unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) else unk_token) + pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) else pad_token) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token @@ -107,7 +94,10 @@ def vocab_size(self): def get_vocab(self): """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } vocab.update(self.added_tokens_encoder) return vocab @@ -130,7 +120,8 @@ def convert_tokens_to_string(self, tokens): out_string = "" prev_is_special = False for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model + # make sure that special tokens are not decoded using + # sentencepiece model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " @@ -143,9 +134,9 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, + save_directory, + filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. 
@@ -157,24 +148,24 @@ def save_vocabulary( `Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) " + "should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): + out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file,) + return (out_vocab_file, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] @@ -194,7 +185,8 @@ def get_special_tokens_mask( already_has_special_tokens: bool = False, ) -> List[int]: """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + Retrieve sequence ids from a token list that has no special tokens + added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: @@ -202,11 +194,14 @@ def get_special_tokens_mask( List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + already_has_special_tokens (`bool`, *optional*, defaults to + `False`): + Whether or not the token list is already formatted with + special tokens for the model. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + `List[int]`: A list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( @@ -220,20 +215,16 @@ def get_special_tokens_mask( if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + Creates a mask from the two sequences passed to be used in a + sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` @@ -250,7 +241,8 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of [token type IDs](../glossary#token-type-ids) + according to the given sequence(s). 
""" bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/vllm/utils.py b/vllm/utils.py index 5b94067cec777..fe6fd27962cd3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -133,9 +133,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. from vllm._C import cuda_utils - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + max_shared_mem = ( + cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) @@ -209,9 +210,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + logger.info(f'CUDA_HOME is not found in the environment. ' + f'Using {cuda_home} as CUDA_HOME.') else: logger.warning( f'Not found nvcc in {cuda_home}. Skip cuda version check!') diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0dd2309079403..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -93,14 +93,13 @@ def load_model(self) -> None: scheduler_config=self.scheduler_config) self.model_memory_usage = m.consumed_memory - logger.info( - f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" - ) + logger.info(f"Loading model weights took " + f"{self.model_memory_usage / float(2**30):.4f} GB") if self.lora_config: - assert hasattr( - self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") assert hasattr( self.model, "embedding_modules"), "Model does not have embedding_modules" diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3229a21c11a38..340c079600c78 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -79,7 +79,8 @@ def profile_num_available_blocks( cpu_swap_space: int = 0, cache_dtype: str = "float16", ) -> Tuple[int, int]: - """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + """Simply returns max_num_seqs as num_gpu_blocks, 0 as + num_cpu_blocks.""" num_gpu_blocks = self.scheduler_config.max_num_seqs num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks @@ -177,7 +178,8 @@ def _init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - distributed_backend = distributed_backend if distributed_backend else "nccl" + distributed_backend = (distributed_backend + if distributed_backend else "nccl") torch.distributed.init_process_group( backend=distributed_backend, world_size=parallel_config.world_size, From 657061fdced8a33a60c1b09f5da2525de9da8f03 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 11 Mar 2024 00:54:51 -0700 Subject: [PATCH 081/113] [docs] Add LoRA support information for models (#3299) --- docs/source/models/lora.rst | 3 ++- docs/source/models/supported_models.rst | 27 ++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) 
diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst index 21b18c75fc552..f05fafe9f8279 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/models/lora.rst @@ -92,7 +92,8 @@ LoRA adapter requests if they were provided and ``max_loras`` is set high enough The following is an example request -.. code-block::bash +.. code-block:: bash + curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9d4ec663a16e5..4019e0bbd90fb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -8,84 +8,109 @@ The following is the list of model architectures that are currently supported by Alongside each architecture, we include some popular models that use it. .. list-table:: - :widths: 25 25 50 + :widths: 25 25 50 5 :header-rows: 1 * - Architecture - Models - Example HuggingFace Models + - :ref:`LoRA ` * - :code:`AquilaForCausalLM` - Aquila - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. + - ✅︎ * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MixtralForCausalLM` - Mixtral-8x7B, Mixtral-8x7B-Instruct - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. 
+ - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. + - * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. + - * - :code:`Qwen2ForCausalLM` - Qwen2 - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - ✅︎ * - :code:`StableLmForCausalLM` - StableLM - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. From 4c922709b65ff5c0652ae36b93047016bdeaace8 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Mon, 11 Mar 2024 11:03:45 -0700 Subject: [PATCH 082/113] Add distributed model executor abstraction (#3191) --- docs/source/dev/engine/llm_engine.rst | 2 +- format.sh | 8 +- tests/lora/conftest.py | 3 +- vllm/__init__.py | 4 +- vllm/config.py | 7 +- vllm/engine/async_llm_engine.py | 106 +++--- vllm/engine/llm_engine.py | 446 +++----------------------- vllm/engine/ray_utils.py | 58 ++-- vllm/executor/__init__.py | 0 vllm/executor/executor_base.py | 75 +++++ vllm/executor/gpu_executor.py | 163 ++++++++++ vllm/executor/ray_gpu_executor.py | 442 +++++++++++++++++++++++++ vllm/executor/utils.py | 13 + 13 files changed, 818 insertions(+), 509 deletions(-) create mode 100644 vllm/executor/__init__.py create mode 100644 vllm/executor/executor_base.py create mode 100644 vllm/executor/gpu_executor.py create mode 100644 vllm/executor/ray_gpu_executor.py create mode 100644 vllm/executor/utils.py diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst index b550a9b5faa62..1de6d7adc87c6 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.rst @@ -2,5 +2,5 @@ LLMEngine ================================= .. autoclass:: vllm.engine.llm_engine.LLMEngine - :members: add_request, abort_request, step, _init_cache + :members: add_request, abort_request, step :show-inheritance: \ No newline at end of file diff --git a/format.sh b/format.sh index eb2c5ab031626..ff30111123bee 100755 --- a/format.sh +++ b/format.sh @@ -95,13 +95,17 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +CODESPELL_EXCLUDES=( + '--skip' '*docs/source/_build/**' +) + # check spelling of specified files spell_check() { codespell "$@" } spell_check_all(){ - codespell --toml pyproject.toml + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" } # Spelling check of files that differ from main branch. @@ -116,7 +120,7 @@ spell_check_changed() { if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell + codespell "${CODESPELL_EXCLUDES[@]}" fi } diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 67273144ecd02..30a8ad03c8ada 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,4 +152,5 @@ def get_model_patched(model_config, device_config, **kwargs): @pytest.fixture def llama_2_7b_model_extra_embeddings( llama_2_7b_engine_extra_embeddings) -> nn.Module: - yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model + yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/__init__.py b/vllm/__init__.py index f1e30f5eb6e6e..5e40c3c20fcd2 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -3,7 +3,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.entrypoints.llm import LLM from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams @@ -19,5 +19,5 @@ "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", - "initialize_cluster", + "initialize_ray_cluster", ] diff --git a/vllm/config.py b/vllm/config.py index e893fe702c975..d2b68b6fa1fe2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, ClassVar +from typing import TYPE_CHECKING, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version @@ -10,6 +10,9 @@ from vllm.transformers_utils.config import get_config from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + logger = init_logger(__name__) _GB = 1 << 30 @@ -397,6 +400,7 @@ def __init__( max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, + placement_group: Optional["PlacementGroup"] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): @@ -412,6 +416,7 @@ def __init__( self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce self.ray_workers_use_nsight = ray_workers_use_nsight + self.placement_group = placement_group self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5629d1a863d04..0cee604c14d45 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,8 +2,8 @@ import os import time from functools import partial -from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator, Callable) +from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, + Union, AsyncIterator) from transformers import PreTrainedTokenizer @@ -11,7 +11,7 @@ from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -208,17 +208,10 @@ async def step_async(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): # Execute the model. - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] + output = await self.model_executor.execute_model_async( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -268,37 +261,8 @@ async def add_request_async( lora_request=lora_request, ) - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = getattr(self.driver_worker, method) - coros.append(asyncio.get_event_loop().run_in_executor( - None, partial(driver_executor, *driver_args, **driver_kwargs))) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs - - async def check_health_async(self): - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() + async def check_health_async(self) -> None: + self.model_executor.check_health() class AsyncLLMEngine: @@ -353,6 +317,34 @@ def __init__(self, self._request_tracker: Optional[RequestTracker] = None self._errored_with: Optional[BaseException] = None + @classmethod + def from_engine_args(cls, + engine_args: AsyncEngineArgs, + start_engine_loop: bool = True) -> "AsyncLLMEngine": + """Creates an async LLM engine from the engine arguments.""" + # Create the engine configs. 
+ engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync + executor_class = RayGPUExecutorAsync + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutorAsync + executor_class = GPUExecutorAsync + # Create the async LLM engine. + engine = cls(parallel_config.worker_use_ray, + engine_args.engine_use_ray, + *engine_configs, + executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + max_log_len=engine_args.max_log_len, + start_engine_loop=start_engine_loop) + return engine + @property def is_running(self) -> bool: return (self.background_loop is not None @@ -670,35 +662,13 @@ async def get_model_config(self) -> ModelConfig: else: return self.engine.get_model_config() - @classmethod - def from_engine_args(cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config, - engine_args.engine_use_ray) - # Create the async LLM engine. - engine = cls(parallel_config.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - placement_group, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=start_engine_loop) - return engine - async def do_log_stats(self) -> None: if self.engine_use_ray: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() - async def check_health(self): + async def check_health(self) -> None: """Raises an error if engine is unhealthy.""" t = time.perf_counter() logger.debug("Starting health check...") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 6e045cd6d73c6..4cdad4180aa14 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,11 +1,5 @@ -import copy -from collections import defaultdict -import os import time -import pickle -import importlib -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Union) +from typing import Dict, Iterable, List, Optional, Tuple, Type, Union from transformers import PreTrainedTokenizer @@ -15,8 +9,9 @@ ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs +from vllm.executor.executor_base import ExecutorBase from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -24,29 +19,11 @@ SequenceGroupOutput, SequenceOutput, SequenceStatus) from vllm.transformers_utils.tokenizer import (detokenize_incrementally, TokenizerGroup) -from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, - get_open_port, get_distributed_init_method) - -if ray: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - 
from ray.util.placement_group import PlacementGroup +from vllm.utils import Counter logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 -# A map between the device type (in device config) to its worker module. -DEVICE_TO_WORKER_MODULE_MAP = { - "cuda": "vllm.worker.worker", - "neuron": "vllm.worker.neuron_worker", -} - -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. -USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) - class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -71,8 +48,8 @@ class LLMEngine: parallel_config: The configuration related to distributed execution. scheduler_config: The configuration related to the request scheduler. device_config: The configuration related to the device. - placement_group: Ray placement group for distributed execution. - Required for distributed execution. + executor_class: The model executor class for managing distributed + execution. log_stats: Whether to log statistics. """ @@ -84,7 +61,7 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - placement_group: Optional["PlacementGroup"], + executor_class: Type[ExecutorBase], log_stats: bool, ) -> None: logger.info( @@ -121,33 +98,13 @@ def __init__( self._init_tokenizer() self.seq_counter = Counter() - # Create the parallel GPU workers. - if self.parallel_config.worker_use_ray: - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - # Pass additional arguments to initialize the worker - additional_ray_args = {} - if self.parallel_config.ray_workers_use_nsight: - logger.info("Configuring Ray workers to use nsight.") - additional_ray_args = { - "runtime_env": { - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - } - } - self._init_workers_ray(placement_group, **additional_ray_args) - else: - self._init_workers() - - # Profile the memory usage and initialize the cache. - self._init_cache() + self.model_executor = executor_class(model_config, cache_config, + parallel_config, scheduler_config, + device_config, lora_config) # Create the scheduler. + # NOTE: the cache_config here have been updated with the numbers of + # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) # Metric Logging. @@ -157,9 +114,29 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag() + @classmethod + def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + + # Initialize the cluster and specify the executor class. 
+ if parallel_config.worker_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutor + executor_class = RayGPUExecutor + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutor + executor_class = GPUExecutor + + # Create the LLM engine. + engine = cls(*engine_configs, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats) + return engine def __reduce__(self): # This is to ensure that the LLMEngine is not referenced in @@ -173,39 +150,6 @@ def get_tokenizer_for_seq(self, sequence: Sequence) -> "PreTrainedTokenizer": return self.tokenizer.get_lora_tokenizer(sequence.lora_request) - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker - - def _init_workers(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - - self.workers: List[Worker] = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self._run_workers("init_model") - self._run_workers("load_model") - def _init_tokenizer(self, **tokenizer_init_kwargs): init_kwargs = dict( enable_lora=bool(self.lora_config), @@ -218,126 +162,6 @@ def _init_tokenizer(self, **tokenizer_init_kwargs): self.tokenizer: TokenizerGroup = TokenizerGroup( self.model_config.tokenizer, **init_kwargs) - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - num_gpus = self.cache_config.gpu_memory_utilization - else: - num_gpus = 1 - - self.driver_dummy_worker: RayWorkerVllm = None - self.workers: List[RayWorkerVllm] = [] - - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - else: - self.workers.append(worker) - - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. 
Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) - - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # Set CUDA_VISIBLE_DEVICES for the driver. - set_cuda_visible_devices(node_gpus[driver_node_id]) - for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): - worker.set_cuda_visible_devices.remote(node_gpus[node_id]) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - # Initialize torch distributed process group for the workers. - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - for rank, (worker, (node_id, - _)) in enumerate(zip(self.workers, - worker_node_and_gpu_ids), - start=1): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) - - # don't use cupy for eager mode - self._run_workers("init_model", - cupy_port=get_open_port() - if not model_config.enforce_eager else None) - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) self.cache_config.verify_with_parallel_config(self.parallel_config) @@ -346,81 +170,6 @@ def _verify_args(self) -> None: self.lora_config.verify_with_scheduler_config( self.scheduler_config) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. 
- - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameters. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - # FIXME(woosuk): Change to debug log. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = self.cache_config.block_size * num_gpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - @classmethod - def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config) - # Create the LLM engine. - engine = cls(*engine_configs, - placement_group, - log_stats=not engine_args.disable_log_stats) - return engine - def encode_request( self, request_id: str, # pylint: disable=unused-argument @@ -826,7 +575,7 @@ def step(self) -> List[RequestOutput]: - A Sequence Group (SG) refer to a group of sequences that are generated from the same prompt. - - Step 2: Calls the workers to execute the model. + - Step 2: Calls the distributed executor to execute the model. - Step 3: Processes the model output. This mainly includes: - Decodes the relevant outputs. @@ -862,19 +611,10 @@ def step(self) -> List[RequestOutput]: seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) - - # Only the driver worker returns the sampling results. 
- output = all_outputs[0] + output = self.model_executor.execute_model( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -1043,111 +783,13 @@ def _finalize_sequence(self, seq: Sequence, seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) + return self.model_executor.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) + return self.model_executor.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - output_channels = self.forward_dag.execute(1) - else: - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - - # Get the results of the ray workers. - if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. - for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs - - def _compiled_ray_dag(self): - import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version - if current_version < required_version: - raise ValueError(f"Ray version {required_version} or greater is " - f"required, but found {current_version}") - - from ray.dag import MultiOutputNode, InputNode - assert self.parallel_config.worker_use_ray - - # Right now, compiled DAG requires at least 1 arg. We send - # a dummy value for now. It will be fixed soon. 
- with InputNode() as input_data: - forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers - ]) - return forward_dag.experimental_compile() + return self.model_executor.list_loras() def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.parallel_config.worker_use_ray: - return - - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") + self.model_executor.check_health() diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index bbcbbdfea2f00..742f3dc575190 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,6 +1,6 @@ import pickle -from typing import Optional, List, Tuple, TYPE_CHECKING +from typing import Optional, List, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -65,45 +65,38 @@ def execute_model_compiled_dag_remote(self, ignored): ray = None RayWorkerVllm = None -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -def initialize_cluster( +def initialize_ray_cluster( parallel_config: ParallelConfig, - engine_use_ray: bool = False, ray_address: Optional[str] = None, -) -> Optional["PlacementGroup"]: - """Initialize the distributed cluster probably with Ray. +): + """Initialize the distributed cluster with Ray. + + it will connect to the Ray cluster and create a placement group + for the workers, which includes the specification of the resources + for each distributed worker. Args: parallel_config: The configurations for parallel execution. - engine_use_ray: Whether to use Ray for async engine. ray_address: The address of the Ray cluster. If None, uses the default Ray cluster address. - - Returns: - An optional `PlacementGroup`. It includes the specification - of the resources for each distributed worker. None if Ray is - not used. """ - if parallel_config.worker_use_ray or engine_use_ray: - if ray is None: - raise ImportError( - "Ray is not installed. Please install Ray to use distributed " - "serving.") - # Connect to a ray cluster. - if is_hip(): - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None + if ray is None: + raise ImportError( + "Ray is not installed. Please install Ray to use distributed " + "serving.") + + # Connect to a ray cluster. + if is_hip(): + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) + + if parallel_config.placement_group: + # Placement group is already set. + return # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() @@ -138,4 +131,5 @@ def initialize_cluster( # if they cannot be provisioned. 
ray.get(current_placement_group.ready(), timeout=1800) - return current_placement_group + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group diff --git a/vllm/executor/__init__.py b/vllm/executor/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py new file mode 100644 index 0000000000000..30717e8a87358 --- /dev/null +++ b/vllm/executor/executor_base.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class ExecutorBase(ABC): + """Base class for all executors. + + An executor is responsible for executing the model on a specific device + type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor + that can execute the model on multiple devices. + """ + + @abstractmethod + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError + + +class ExecutorAsyncBase(ExecutorBase): + + @abstractmethod + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + async def check_health_async(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py new file mode 100644 index 0000000000000..9019ee7763c77 --- /dev/null +++ b/vllm/executor/gpu_executor.py @@ -0,0 +1,163 @@ +import importlib +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. 
+DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class GPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_model() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self.driver_worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." 
+ return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return + + +class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py new file mode 100644 index 0000000000000..261fcfb7dad9b --- /dev/null +++ b/vllm/executor/ray_gpu_executor.py @@ -0,0 +1,442 @@ +import asyncio +import copy +from collections import defaultdict +import os +import pickle +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port, + get_distributed_init_method, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. 
+ self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # Set CUDA_VISIBLE_DEVICES for the driver and workers. 
+ set_cuda_visible_devices(node_gpus[driver_node_id]) + for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): + worker.set_cuda_visible_devices.remote(node_gpus[node_id]) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + # FIXME(woosuk): We are not properly initializing cupy NCCL when + # we have multiple nodes. + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. 
+ num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. 
+ for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. + for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py new file mode 100644 index 0000000000000..44976696a77c6 --- /dev/null +++ b/vllm/executor/utils.py @@ -0,0 +1,13 @@ +def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. 
" + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From c9415c19d3df26d8ede611abefba35c6837cd934 Mon Sep 17 00:00:00 2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Tue, 12 Mar 2024 04:14:07 +0800 Subject: [PATCH 083/113] [ROCm] Fix warp and lane calculation in blockReduceSum (#3321) --- csrc/reduction_utils.cuh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index 210bf0b023ab2..c25464e866e55 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -29,12 +29,22 @@ __inline__ __device__ T warpReduceSum(T val) { return val; } +__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) { + return warp_size - 1; +} + +__inline__ __device__ constexpr int _calculateWidShift(int warp_size) { + return 5 + (warp_size >> 6); +} + /* Calculate the sum of all elements in a block */ template __inline__ __device__ T blockReduceSum(T val) { static __shared__ T shared[WARP_SIZE]; - int lane = threadIdx.x & 0x1f; - int wid = threadIdx.x >> 5; + constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE); + constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE); + int lane = threadIdx.x & LANE_MASK; + int wid = threadIdx.x >> WID_SHIFT; val = warpReduceSum(val); From 654865e21df8ac6fe95de926625306e5756c2c0d Mon Sep 17 00:00:00 2001 From: DAIZHENWEI <32122197+DAIZHENWEI@users.noreply.github.com> Date: Mon, 11 Mar 2024 13:19:51 -0700 Subject: [PATCH 084/113] Support Mistral Model Inference with transformers-neuronx (#3153) --- examples/offline_inference_neuron.py | 10 ++- vllm/model_executor/models/__init__.py | 7 +- vllm/model_executor/models/neuron/mistral.py | 82 ++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) mode change 100644 => 100755 examples/offline_inference_neuron.py mode change 100644 => 100755 vllm/model_executor/models/__init__.py create mode 100755 vllm/model_executor/models/neuron/mistral.py diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py old mode 100644 new mode 100755 index 9b9dc4d94892f..da8874abd92a2 --- a/examples/offline_inference_neuron.py +++ b/examples/offline_inference_neuron.py @@ -14,14 +14,16 @@ llm = LLM( model="openlm-research/open_llama_3b", max_num_seqs=8, - # The max_model_len and block_size arguments are required to be same as max sequence length, - # when targeting neuron device. Currently, this is a known limitation in continuous batching - # support in transformers-neuronx. + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. # TODO(liangfu): Support paged-attention in transformers-neuronx. max_model_len=128, block_size=128, # The device can be automatically detected when AWS Neuron SDK is installed. - # The device argument can be either unspecified for automated detection, or explicitly assigned. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. 
device="neuron") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py old mode 100644 new mode 100755 index 75c2ae1e9f48e..bc3b6a582d53d --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -62,8 +62,11 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } -# Models not supported by Neuron. -_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} +# Models supported by Neuron. +_NEURON_SUPPORTED_MODELS = { + "LlamaForCausalLM": "neuron.llama", + "MistralForCausalLM": "neuron.mistral" +} class ModelRegistry: diff --git a/vllm/model_executor/models/neuron/mistral.py b/vllm/model_executor/models/neuron/mistral.py new file mode 100755 index 0000000000000..a302cce30abab --- /dev/null +++ b/vllm/model_executor/models/neuron/mistral.py @@ -0,0 +1,82 @@ +"""Inference-only Mistral model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MistralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput +import os + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MistralForCausalLM(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.lm_head = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> SamplerOutput: + with torch.inference_mode(): + seq_ids = [] + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.mistral.model import MistralForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers import MistralForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = MistralForCausalLM.from_pretrained( + model_name_or_path, low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = MistralForSampling.from_pretrained( + split_model_dir, **kwargs) + self.model.to_neuron() From b0925b38789bb3b20dcc39e229fcfe12a311e487 Mon Sep 17 00:00:00 2001 From: Sherlock Xu <65327072+Sherlock113@users.noreply.github.com> Date: Wed, 13 Mar 2024 01:34:30 +0800 Subject: [PATCH 
085/113] docs: Add BentoML deployment doc (#3336) Signed-off-by: Sherlock113 --- docs/source/index.rst | 1 + docs/source/serving/deploying_with_bentoml.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/serving/deploying_with_bentoml.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index c0250bf99f7ae..65bfbbabf8be1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,6 +73,7 @@ Documentation serving/run_on_sky serving/deploying_with_kserve serving/deploying_with_triton + serving/deploying_with_bentoml serving/deploying_with_docker serving/serving_with_langchain serving/metrics diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst new file mode 100644 index 0000000000000..4b9d19f5bdb72 --- /dev/null +++ b/docs/source/serving/deploying_with_bentoml.rst @@ -0,0 +1,8 @@ +.. _deploying_with_bentoml: + +Deploying with BentoML +====================== + +`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. + +For details, see the tutorial `vLLM inference in the BentoML documentation `_. \ No newline at end of file From 49a3c8662ba745503890ab8b3c502aad7e1a0a19 Mon Sep 17 00:00:00 2001 From: Breno Faria Date: Wed, 13 Mar 2024 01:30:08 +0100 Subject: [PATCH 086/113] Fixes #1556 double free (#3347) --- tests/core/test_block_manager.py | 87 ++++++++++++++++++++++++++++++++ vllm/core/block_manager.py | 17 ++++++- 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index b280fd1d73c2f..44ac05a1430b3 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -274,3 +274,90 @@ def test_reset(): # Resetting block manager frees all allocated blocks. block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + + +def test_sliding_window_multi_seq(): + """ + Tests that memory allocation and deallocation is handled + correctly with multiple sequences that exceed the sliding + window's capacity. + """ + block_size = 1 + num_cpu_blocks = 8 + num_gpu_blocks = 8 + sliding_window = 2 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + sliding_window=sliding_window, + watermark=0) + + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks + + parent = Sequence(1, "one two three", [0, 1, 2], block_size) + seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), + None) + block_manager.allocate(seq_group) + + # assert the number of blocks allocated is correct + # the parent seq has len 3, but since sliding_window is 2, + # we will use at most 2 blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # Fork prompt and copy block tables. + child = parent.fork(2) + block_manager.fork(parent, child) + + # assert the number of blocks allocated is correct + # forking does not increase memory consumption + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # assert both parent and child share all blocks + assert block_manager.get_block_table( + parent) == block_manager.get_block_table(child) + + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. 
+ child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + + # assert the number of blocks allocated is correct + # we will use now one block more. Each seq will use 2 blocks, + # but only one can be shared + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + token_id = 5 + parent.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(parent) + + # assert the number of blocks allocated is correct + # no change, because both sequences are still just sharing one block + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + block_table_parent = block_manager.get_block_table(parent) + block_table_child = block_manager.get_block_table(child) + + assert block_table_parent != block_table_child + + # assert both blocks are sharing the second-last block + assert block_table_parent[-2] == block_table_child[-2] + + # now let's clean up... + block_manager.free(parent) + + # assert the number of blocks allocated is correct + # We have freed one seq, reducing the ref count of two blocks by one. + # One of the two was only used by the parent seq, so this is now free. + # The child seq still consumes sliding_window blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # free all blocks + block_manager.free(child) + + # assert all blocks are free now + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 8bfc14999f0a7..8b089a5650f48 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -312,7 +312,12 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: # Thus, it is always safe from OOM. src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.copy() - for block in src_block_table: + # When using a sliding window, blocks will be eventually reused. + # In this case the block tables will contain repeated blocks. + # When forking, we must make sure that each block's `ref_count` + # is only incremented by one, so we deduplicate them by wrapping + # them in a set. + for block in set(src_block_table): block.ref_count += 1 def _get_physical_blocks( @@ -393,7 +398,15 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: return block_number_mapping def _free_block_table(self, block_table: BlockTable) -> None: - for block in set(block_table): + # when using a sliding window, each seq will only use up + # to `self.block_sliding_window` blocks. When freeing + # the block table, we must make sure to not free blocks more + # than once. If no sliding window is used, there is no block + # reuse in the block table, so we must free all blocks. 
+ blocks_to_free = (block_table[-self.block_sliding_window:] + if self.block_sliding_window is not None else + block_table) + for block in set(blocks_to_free): if block.device == Device.GPU: self.gpu_allocator.free(block) else: From 602358f8a86ef9fc0ba882e083e19b44e00b9302 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Mar 2024 22:06:17 -0700 Subject: [PATCH 087/113] Add kernel for GeGLU with approximate GELU (#3337) --- csrc/activation_kernels.cu | 22 +++++++++++++++++++++- csrc/ops.h | 4 ++++ csrc/pybind.cpp | 6 +++++- tests/kernels/test_activation.py | 11 ++++++++--- vllm/model_executor/layers/activation.py | 13 +++++++++++-- 5 files changed, 49 insertions(+), 7 deletions(-) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 22b10f0571d1c..24d972702c858 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -33,12 +33,25 @@ template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. // Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 const float f = (float) x; constexpr float ALPHA = M_SQRT1_2; return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + const float f = (float) x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); + return (T) (0.5f * f * (1.0f + ::tanhf(inner))); +} + } // namespace vllm // Launch activation and gating kernel. @@ -73,6 +86,13 @@ void gelu_and_mul( LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } +void gelu_tanh_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); +} + namespace vllm { // Element-wise activation kernel template. 
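For reference, the gelu_tanh_kernel added above implements the tanh
approximation of GELU, 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
where BETA = M_SQRT2 * M_2_SQRTPI * 0.5 equals sqrt(2/pi). Below is a minimal
reference sketch, not part of the patch, that checks this formula against
PyTorch's F.gelu(..., approximate="tanh") on the CPU; the helper name
gelu_tanh_ref and the tensor shapes are illustrative only. The CUDA path itself
is exercised by tests/kernels/test_activation.py shown earlier in this patch.

    import math

    import torch
    import torch.nn.functional as F

    def gelu_tanh_ref(x: torch.Tensor) -> torch.Tensor:
        # Same constants as the CUDA kernel: BETA = sqrt(2/pi), KAPPA = 0.044715.
        beta = math.sqrt(2.0 / math.pi)
        kappa = 0.044715
        return 0.5 * x * (1.0 + torch.tanh(beta * (x + kappa * x.pow(3))))

    # GeGLU layout used by gelu_tanh_and_mul: act(x[..., :d]) * x[..., d:].
    x = torch.randn(128, 2 * 64)
    d = x.shape[-1] // 2
    out_ref = gelu_tanh_ref(x[..., :d]) * x[..., d:]
    out_torch = F.gelu(x[..., :d], approximate="tanh") * x[..., d:]
    torch.testing.assert_close(out_ref, out_torch)
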
diff --git a/csrc/ops.h b/csrc/ops.h index 249c7451bf73c..53222972abb70 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -61,6 +61,10 @@ void gelu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_tanh_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 4b6ade7566398..39384f08d928c 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -25,7 +25,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def( "gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU."); + "Activation function used in GeGLU with `none` approximation."); + ops.def( + "gelu_tanh_and_mul", + &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); ops.def( "gelu_new", &gelu_new, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index e0dec144eba11..f78913f120aa4 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -16,7 +16,7 @@ ] -@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) +@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -24,7 +24,7 @@ @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( - activation: Type[torch.nn.Module], + activation: str, num_tokens: int, d: int, dtype: torch.dtype, @@ -36,7 +36,12 @@ def test_act_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = activation() + if activation == "silu": + layer = SiluAndMul() + elif activation == "gelu": + layer = GeluAndMul(approximate="none") + elif activation == "gelu_tanh": + layer = GeluAndMul(approximate="tanh") out = layer(x) ref_out = layer._forward(x) # The SiLU and GELU implementations are equivalent to the native PyTorch diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5a3a7b2dbaee7..3eb73ee109f50 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -47,16 +47,25 @@ class GeluAndMul(nn.Module): return: (batch_size, seq_len, d) or (num_tokens, d) """ + def __init__(self, approximate: str = "none"): + super().__init__() + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 - return F.gelu(x[..., :d]) * x[..., d:] + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) return out From b167109ba12f18d028d2be8a61d3dce950eb2724 Mon Sep 17 00:00:00 2001 From: Bo-Wen Wang <1849994161@qq.com> Date: Wed, 13 Mar 2024 13:51:42 +0800 Subject: [PATCH 088/113] [Fix] Fix quantization="gptq" when using Marlin (#3319) Co-authored-by: Woosuk Kwon --- vllm/config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 
d2b68b6fa1fe2..319c1569f5e98 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -168,13 +168,18 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: - hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. if (hf_quant_method == "gptq" and "is_marlin_format" in hf_quant_config and hf_quant_config["is_marlin_format"]): + logger.info("The model is serialized in Marlin format. " + "Using Marlin kernel.") hf_quant_method = "marlin" + if self.quantization == "gptq": + self.quantization = hf_quant_method + if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: From e221910e77087743a50560e4ae69c3c2a12beb53 Mon Sep 17 00:00:00 2001 From: Ronan McGovern <78278410+RonanKMcGovern@users.noreply.github.com> Date: Wed, 13 Mar 2024 06:33:43 +0000 Subject: [PATCH 089/113] add hf_transfer to requirements.txt (#3031) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index dd4867702d3de..18770f994ebd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -96,7 +96,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate + pip install accelerate hf_transfer COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm From ba8dc958a3d8533a6e5b7debda47e4d42a062b78 Mon Sep 17 00:00:00 2001 From: Hui Liu <96135754+hliuca@users.noreply.github.com> Date: Wed, 13 Mar 2024 09:16:55 -0700 Subject: [PATCH 090/113] [Minor] Fix bias in if to remove ambiguity (#3259) --- vllm/model_executor/layers/linear.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 60f6fc83b200f..40e681df48f86 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -73,7 +73,7 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - if bias: + if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) return F.linear(x, weight, bias) From 739c350c1926682f435316294491aa54661849b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= Date: Thu, 14 Mar 2024 00:43:24 +0800 Subject: [PATCH 091/113] [Minor Fix] Use cupy-cuda11x in CUDA 11.8 build (#3256) --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index 023c3cde1910c..accf6bb400310 100644 --- a/setup.py +++ b/setup.py @@ -431,6 +431,12 @@ def get_requirements() -> List[str]: else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") + if nvcc_cuda_version <= Version("11.8"): + # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x + for i in range(len(requirements)): + if requirements[i].startswith("cupy-cuda12x"): + requirements[i] = "cupy-cuda11x" + break return requirements From ae0ccb40170d140ded8de99fc905fd8cb0bd409c Mon Sep 17 00:00:00 2001 From: Or Sharir Date: Wed, 13 Mar 2024 21:18:25 +0200 Subject: [PATCH 092/113] Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism) when using Multi-LoRA. 
(#3350) --- csrc/punica/bgmv/bgmv_config.h | 1 + tests/lora/test_punica.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 4dc90de1ab42a..a7415dfc91369 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -43,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index cbe0f6fa2e851..fd707766c6a30 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -45,7 +45,7 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, - 24576, 32000, 32256, 32512, 32768, 33024 + 22016, 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] From 7e9bd08f60a4b18e3646ff986caeacde9ffffa53 Mon Sep 17 00:00:00 2001 From: Terry <149540247+tterrysun@users.noreply.github.com> Date: Wed, 13 Mar 2024 13:45:26 -0700 Subject: [PATCH 093/113] Add batched RoPE kernel (#3095) --- benchmarks/kernels/benchmark_rope.py | 120 ++++++++++++++++ csrc/ops.h | 10 ++ csrc/pos_encoding_kernels.cu | 126 ++++++++++++++-- csrc/pybind.cpp | 5 + tests/kernels/test_pos_encoding.py | 135 +++++++++++++++++- .../model_executor/layers/rotary_embedding.py | 58 +++++--- 6 files changed, 417 insertions(+), 37 deletions(-) create mode 100644 benchmarks/kernels/benchmark_rope.py diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 0000000000000..f9564dd9588f0 --- /dev/null +++ b/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,120 @@ +from typing import Optional + +import argparse +import torch +import nvtx +from itertools import accumulate +from vllm.model_executor.layers.rotary_embedding import get_rope + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope(head_size, rotary_dim, max_position, base, + is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope(head_size, rotary_dim, max_position, base, is_neox_style, + { + "type": "linear", + "factor": (scaling_factor, ) + })) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv cache of its type 
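The offset arithmetic described in the comment above can be shown on its own: each scaling factor owns one contiguous slice of the concatenated cache, and a request's offset is the start of its slice. The sizes below are made up for illustration and the snippet is independent of the benchmark script.

# Standalone sketch of per-factor cache offsets (toy sizes, not the benchmark).
from itertools import accumulate

max_position = 8
scaling_factors = [1, 2, 4, 8]

# Slice i holds the rows belonging to scaling_factors[i]; its length here is
# simply max_position * factor (the benchmark sizes its slices differently).
slice_lengths = [max_position * s for s in scaling_factors]
offsets = list(accumulate([0] + slice_lengths[:-1]))
assert offsets == [0, 8, 24, 56]

# A request of type t at position pos reads row offsets[t] + pos of the cache.
for t, pos in [(0, 3), (2, 5), (3, 7)]:
    row = offsets[t] + pos
    assert offsets[t] <= row < offsets[t] + slice_lengths[t]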
+ offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Benchmark the rotary embedding kernels.") + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, + choices=[64, 80, 96, 112, 128, 256], + default=128) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument("--dtype", + type=str, + choices=["bfloat16", "float"], + default="float") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--device", + type=str, + choices=["cuda:0", "cuda:1"], + default="cuda:0") + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/csrc/ops.h b/csrc/ops.h index 53222972abb70..d5d6e240da7c4 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -53,6 +53,16 @@ void rotary_embedding( torch::Tensor& cos_sin_cache, bool is_neox); +void batched_rotary_embedding( + torch::Tensor& positions, + torch::Tensor& query, + torch::Tensor& key, + int head_size, + torch::Tensor& cos_sin_cache, + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets); + void silu_and_mul( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 5f522795619e1..d80cb6973fad6 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -8,7 +8,7 @@ namespace vllm { template -inline __device__ void apply_rotary_embedding( +inline __device__ void apply_token_rotary_embedding( scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, const scalar_t* __restrict__ sin_ptr, @@ -38,22 +38,18 @@ inline __device__ void apply_rotary_embedding( } template -__global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +inline __device__ void apply_rotary_embedding( scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const 
scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, + const scalar_t* cache_ptr, + const int head_size, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. - const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - + const int rot_dim, + const int token_idx, + const int64_t query_stride, + const int64_t key_stride) +{ const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -63,7 +59,7 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * query_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(query + token_head, cos_ptr, + apply_token_rotary_embedding(query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } @@ -72,11 +68,53 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * key_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(key + token_head, cos_ptr, + apply_token_rotary_embedding(key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } } +template +__global__ void rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + +template +__global__ void batched_rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. 
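For readers who want the math behind these kernels, a NeoX-style rotation of a single token can be written in a few lines of plain Python. This is an illustrative reference under the usual RoPE pairing convention (element i paired with i + rot_dim/2), not the CUDA implementation above.

# Pure-Python reference for one token's NeoX-style rotary embedding.
import math

def rotate_token_neox(vec, pos, rot_dim, base=10000.0):
    """Return a copy of one head vector with its first rot_dim entries rotated."""
    embed_dim = rot_dim // 2
    out = list(vec)
    for i in range(embed_dim):
        inv_freq = base ** (-2.0 * i / rot_dim)
        cos, sin = math.cos(pos * inv_freq), math.sin(pos * inv_freq)
        x, y = vec[i], vec[i + embed_dim]        # NeoX pairing: (i, i + rot_dim/2)
        out[i] = x * cos - y * sin
        out[i + embed_dim] = y * cos + x * sin
    return out

head = [0.1 * k for k in range(8)]               # head_size = rot_dim = 8
assert rotate_token_neox(head, pos=3, rot_dim=8) != head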
+ const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + } // namespace vllm void rotary_embedding( @@ -128,3 +166,61 @@ void rotary_embedding( } }); } + +/* +Batched version of rotary embedding, pack multiple LoRAs together +and process in batched manner. +*/ +void batched_rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] +) { + int64_t num_tokens = cos_sin_cache_offsets.size(0); + int num_heads = query.size(-1) / head_size; + int num_kv_heads = key.size(-1) / head_size; + int64_t query_stride = query.stride(-2); + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), + "rotary_embedding", + [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } else { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } + }); +} diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 39384f08d928c..a5c6439fd6909 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -56,6 +56,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def( + "batched_rotary_embedding", + &batched_rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)"); + // Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0d27bbaff9fc5..ffdcc1e8c80fd 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import List, Optional import pytest import torch from allclose_default import get_default_atol, get_default_rtol +from itertools import accumulate from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] @@ -72,3 +73,135 @@ def test_rotary_embedding( ref_key, atol=get_default_atol(out_key), rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) 
+@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": (1, ) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key) + out_query, out_key = rope.forward(positions, + query, + key, + offsets=torch.zeros(batch_size * seq_len, + dtype=int, + device=device)) + # Compare the results. + assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + scaling_factors: List[int] = [1, 2, 4] + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + query_offsets = offset_map[query_types] + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key, query_offsets) + out_query, out_key = rope.forward(positions, query, key, + query_offsets.flatten()) + # Compare the results. 
+ assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 13749570f28a2..db5c7080b50b0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -22,7 +22,7 @@ # limitations under the License. """Rotary Positional Embeddings.""" import math -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -96,6 +96,7 @@ def _forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query = query.view(*query.shape[:-1], -1, self.head_size) @@ -107,7 +108,9 @@ def _forward( query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - cos_sin = self.cos_sin_cache[positions] + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) if self.is_neox_style: # NOTE(woosuk): Here we assume that the positions tensor has the @@ -137,11 +140,19 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - # ops.rotary_embedding() is an in-place operation that - # updates the query and key tensors. - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + # ops.rotary_embedding()/batched_rotary_embedding() are in-place operations that + # update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key @@ -158,27 +169,32 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factor: float, + scaling_factors: Union[List[float], float], ) -> None: - self.scaling_factor = scaling_factor + if isinstance(scaling_factors, float): + scaling_factors = [scaling_factors] + self.scaling_factors = scaling_factors super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style) def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. 
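To make the per-factor rebuild of the cache in this hunk concrete, the toy sketch below builds the same kind of concatenated cache with tiny sizes: every factor contributes max_position * factor rows, and each position is divided by its factor before the frequencies are applied, so every slice stays within the originally trained range. Names and sizes are illustrative only, not the vLLM implementation.

# Toy illustration of linear RoPE scaling with multiple factors.
import math

max_position, rot_dim, base = 4, 4, 10000.0
inv_freq = [base ** (-2.0 * i / rot_dim) for i in range(rot_dim // 2)]

def cos_sin_rows(scaling_factor):
    rows = []
    for t in range(max_position * scaling_factor):
        t_scaled = t / scaling_factor            # the key step of linear scaling
        freqs = [t_scaled * f for f in inv_freq]
        rows.append([math.cos(v) for v in freqs] + [math.sin(v) for v in freqs])
    return rows

cache = []
for factor in (1, 2, 4):                         # one slice per scaling factor
    cache.extend(cos_sin_rows(factor))

# Slice sizes add up, which is what the flat per-request offsets rely on.
assert len(cache) == max_position * (1 + 2 + 4)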
- max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache + cache_list = [] + for scaling_factor in self.scaling_factors: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. + max_len = self.max_position_embeddings * scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + cache_list.append(cache) + return torch.cat(cache_list, dim=0) class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): From c33afd89f56ba5c260275fdd6723c59642f82f22 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 13 Mar 2024 13:56:49 -0700 Subject: [PATCH 094/113] Fix lint (#3388) --- vllm/model_executor/layers/rotary_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index db5c7080b50b0..71af9b26e2e93 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -143,8 +143,8 @@ def forward( offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) - # ops.rotary_embedding()/batched_rotary_embedding() are in-place operations that - # update the query and key tensors. + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. if offsets is not None: ops.batched_rotary_embedding(positions, query, key, self.head_size, self.cos_sin_cache, From eeab52a4ff02e15f970880a689df2861ad173770 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Wed, 13 Mar 2024 14:18:40 -0700 Subject: [PATCH 095/113] [FIX] Simpler fix for async engine running on ray (#3371) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 261fcfb7dad9b..82a2b456895e8 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -430,8 +430,7 @@ async def execute_model_async( "blocks_to_swap_in": blocks_to_swap_in, "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + }) # Only the driver worker returns the sampling results. 
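A minimal sketch of the convention relied on just below: every worker executes the step, but only the first (driver) entry of the gathered outputs carries sampler results, so the caller keeps element 0. The names here are hypothetical stand-ins, not the vLLM executor API.

# Hypothetical illustration of "only the driver worker returns results".
def run_workers(n_workers):
    outputs = []
    for rank in range(n_workers):
        outputs.append({"sampled": [1, 2, 3]} if rank == 0 else None)
    return outputs

all_outputs = run_workers(4)
output = all_outputs[0]                  # driver result; the rest are placeholders
assert output is not None and all(o is None for o in all_outputs[1:])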
output = all_outputs[0] From 81653d968842d2ec51b2642b6b5d83786271f9af Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 13 Mar 2024 17:02:21 -0700 Subject: [PATCH 096/113] [Hotfix] [Debug] test_openai_server.py::test_guided_regex_completion (#3383) --- .buildkite/test-pipeline.yaml | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 42a1eacb6de57..6a130f6fadcc3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,7 +13,7 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness - + - label: Core Test command: pytest -v -s core diff --git a/requirements.txt b/requirements.txt index 05ec2e804e13b..d6c33ad85da58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 -outlines >= 0.0.27 +outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. From a37415c31b3b5c7ab40d2d897192025f0ca7be08 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Thu, 14 Mar 2024 14:35:13 +0800 Subject: [PATCH 097/113] allow user to chose which vllm's merics to display in grafana (#3393) --- examples/production_monitoring/grafana.json | 184 ++++++++++---------- 1 file changed, 88 insertions(+), 96 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index f48b6314eb055..071f134c6e5e0 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,35 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.2.3" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -42,6 +11,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -50,14 +25,14 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 29, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "End to end request latency measured in seconds.", "fieldConfig": { @@ -66,7 +41,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -80,7 +54,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -138,11 +111,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, 
"includeNullMetadata": false, "instant": false, @@ -154,11 +127,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -171,11 +144,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -188,11 +161,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -205,10 +178,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -222,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of tokens processed per second", "fieldConfig": { @@ -231,7 +204,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -245,7 +217,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -302,11 +273,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -318,11 +289,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -339,7 +310,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Inter token latency in seconds.", "fieldConfig": { @@ -348,7 +319,6 @@ "mode": 
"palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -362,7 +332,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -420,11 +389,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -436,11 +405,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -453,11 +422,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -470,11 +439,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -487,10 +456,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Mean", @@ -504,7 +473,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { @@ -513,7 +482,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -527,7 +495,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -585,11 +552,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_running", + "expr": 
"vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -601,11 +568,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -618,11 +585,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_waiting", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -639,7 +606,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { @@ -648,7 +615,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -662,7 +628,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -720,11 +685,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -737,11 +702,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -753,11 +718,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -770,11 +735,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -787,10 +752,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": 
"rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -804,7 +769,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { @@ -813,7 +778,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -827,7 +791,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -885,10 +848,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, "legendFormat": "GPU Cache Usage", "range": true, @@ -897,10 +860,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, "instant": false, "legendFormat": "CPU Cache Usage", @@ -913,10 +876,39 @@ } ], "refresh": "", - "schemaVersion": 39, + "schemaVersion": 37, + "style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "current": { + "selected": false, + "text": "vllm", + "value": "vllm" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(model_name)", + "hide": 0, + "includeAll": false, + "label": "model_name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { "from": "now-5m", From 8fe838659164b415d7f3044ec6b7e5bc52c6b6a5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 14 Mar 2024 01:11:48 -0700 Subject: [PATCH 098/113] [Kernel] change benchmark script so that result can be directly used; tune moe kernel in A100/H100 with tp=2,4,8 (#3389) --- benchmarks/kernels/benchmark_mixtral_moe.py | 30 ++-- .../layers/fused_moe/__init__.py | 6 +- ...792,device_name=NVIDIA_A100-SXM4-80GB.json | 146 +++++++++++++++ ...792,device_name=NVIDIA_H100_80GB_HBM3.json | 146 +++++++++++++++ ...584,device_name=NVIDIA_A100-SXM4-80GB.json | 162 +++++++++++++++-- ...584,device_name=NVIDIA_H100_80GB_HBM3.json | 146 +++++++++++++++ ...168,device_name=NVIDIA_A100-SXM4-80GB.json | 146 +++++++++++++++ ...168,device_name=NVIDIA_H100_80GB_HBM3.json | 166 +++++++++++++++--- .../layers/fused_moe/fused_moe.py | 10 +- 9 files changed, 903 insertions(+), 55 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git 
a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 9e08df76947f8..964eca5aaf72b 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -2,13 +2,13 @@ import os import sys -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name import torch import torch.nn.functional as F import triton +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + def main(): method = fused_moe @@ -64,7 +64,7 @@ def run_grid(bs, method): print(f'{tp_size=} {bs=}') print(f'{config}') # warmup - print(f'warming up') + print('warming up') try: for _ in range(num_warmup_trials): run_timing( @@ -82,7 +82,7 @@ def run_grid(bs, method): continue # trial - print(f'benchmarking') + print('benchmarking') for _ in range(num_trials): kernel_dur_ms = run_timing( num_calls=num_calls, @@ -103,17 +103,25 @@ def run_grid(bs, method): best_config = config best_time_us = kernel_dur_us - print( - f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' - ) + print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}' + f' {bs=} {tp_size=} {top_k=} {num_total_experts=} ' + f'{d_model=} {model_intermediate_size=} {num_layers=}') print("best_time_us", best_time_us) print("best_config", best_config) - filename = "/tmp/config.jsonl" + # holds Dict[str, Dict[str, int]] + filename = get_config_file_name(num_total_experts, + model_intermediate_size // tp_size) print(f"writing config to file {filename}") - with open(filename, "a") as f: - f.write(json.dumps({str(bs): best_config}) + "\n") + existing_content = {} + if os.path.exists(filename): + with open(filename, "r") as f: + existing_content = json.load(f) + existing_content[str(bs)] = best_config + with open(filename, "w") as f: + json.dump(existing_content, f, indent=4) + f.write("\n") def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1391d43c8abeb..299ab44f8f3d5 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,9 @@ -from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_moe, + get_config_file_name, +) __all__ = [ "fused_moe", + "get_config_file_name", ] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..5c8185cfdeec1 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + 
"16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..97c9f4445b166 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + 
"BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json index 1fefb5ff7e42d..edf2a38d12ad3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -1,20 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, 
"num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 0000000000000..b2100cebb7f58 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + 
"num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 0000000000000..f578c8d0160ac --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json index 64d49ca66c1c8..e341a67917d51 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -1,24 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 
128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + 
"num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e6dd0dfe2eb3..1ec09f0cd4c28 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -245,6 +245,11 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +def get_config_file_name(E: int, N: int) -> str: + device_name = torch.cuda.get_device_name().replace(" ", "_") + return f"E={E},N={N},device_name={device_name}.json" + + @functools.lru_cache def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ @@ -258,11 +263,10 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # First look up if an optimized configuration is available in the configs # directory - device_name = torch.cuda.get_device_name().replace(" ", "_") + json_file_name = get_config_file_name(E, N) config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "configs", - f"E={E},N={N},device_name={device_name}.json") + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: logger.info( From 06ec486794f42db656c3cc16c8c5ed56ce4f696b Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 14 Mar 2024 18:55:54 +0100 Subject: [PATCH 099/113] Install `flash_attn` in Docker image (#3396) --- Dockerfile | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Dockerfile b/Dockerfile index 18770f994ebd2..8be03b3567f0e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### +#################### FLASH_ATTENTION Build IMAGE #################### +FROM dev as flash-attn-builder +# max jobs used for build +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} +# flash attention version +ARG flash_attn_version=v2.5.6 +ENV FLASH_ATTN_VERSION=${flash_attn_version} + +WORKDIR /usr/src/flash-attention-v2 + +# Download the wheel or build it if a pre-compiled release doesn't exist +RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ + --no-build-isolation --no-deps --no-cache-dir + +#################### FLASH_ATTENTION Build IMAGE #################### #################### TEST IMAGE #################### # image to run unit testing suite @@ -68,6 +84,9 @@ WORKDIR /vllm-workspace # ADD is used to preserve directory structure ADD . /vllm-workspace/ COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose @@ -88,6 +107,11 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt + +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir + #################### RUNTIME BASE IMAGE #################### From c17ca8ef186b5e90a500d3e37724b220944450f7 Mon Sep 17 00:00:00 2001 From: Dan Clark <44146800+declark1@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:11:45 -0700 Subject: [PATCH 100/113] Add args for mTLS support (#3410) Co-authored-by: Daniel Clark --- vllm/entrypoints/api_server.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 86b6c4c67cfa4..5130586e036b2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -82,6 +82,14 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument("--ssl-cert-reqs", + type=int, + default=0, + help="Whether client certificate is required") parser.add_argument( "--root-path", type=str, @@ -100,4 +108,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: log_level="debug", timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs) From dfc77408bdca19308cbb28a54dfe697442fbf335 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 14 Mar 2024 13:16:00 -0700 Subject: [PATCH 101/113] [issue templates] add some issue templates (#3412) --- .github/ISSUE_TEMPLATE/100-documentation.yml | 22 + .github/ISSUE_TEMPLATE/200-installation.yml | 39 + .github/ISSUE_TEMPLATE/300-usage.yml | 37 + .github/ISSUE_TEMPLATE/400-bug report.yml | 81 +++ .../ISSUE_TEMPLATE/500-feature request.yml | 31 + .github/ISSUE_TEMPLATE/600-new model.yml | 33 + .../700-performance discussion.yml | 51 ++ .../ISSUE_TEMPLATE/800-misc discussion.yml | 21 + .github/ISSUE_TEMPLATE/config.yml | 1 + .yapfignore | 1 + collect_env.py | 688 ++++++++++++++++++ 11 files changed, 1005 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/100-documentation.yml create mode 100644 .github/ISSUE_TEMPLATE/200-installation.yml create mode 100644 .github/ISSUE_TEMPLATE/300-usage.yml create mode 100644 .github/ISSUE_TEMPLATE/400-bug report.yml create mode 100644 .github/ISSUE_TEMPLATE/500-feature request.yml create mode 100644 .github/ISSUE_TEMPLATE/600-new model.yml create mode 100644 .github/ISSUE_TEMPLATE/700-performance discussion.yml create mode 100644 .github/ISSUE_TEMPLATE/800-misc discussion.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .yapfignore create mode 100644 collect_env.py diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml new file mode 100644 index 0000000000000..7ef052a525963 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -0,0 +1,22 @@ +name: 📚 Documentation +description: Report an issue related to https://docs.vllm.ai/ +title: "[Doc]: " +labels: 
["doc"] + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://docs.vllm.ai/ is an issue. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml new file mode 100644 index 0000000000000..4c6c96187cc6c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -0,0 +1,39 @@ +name: 🛠️ Installation +description: Report an issue here when you hit errors during installation. +title: "[Installation]: " +labels: ["installation"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How you are installing vllm + description: | + Paste the full command you are trying to execute. + value: | + ```sh + pip install -vvv vllm + ``` +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml new file mode 100644 index 0000000000000..88227b4b2e7b9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -0,0 +1,37 @@ +name: 💻 Usage +description: Raise an issue here if you don't know how to use vllm. +title: "[Usage]: " +labels: ["usage"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How would you like to use vllm + description: | + A detailed description of how you want to use vllm. + value: | + I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml new file mode 100644 index 0000000000000..f1124dfa78bbc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -0,0 +1,81 @@ +name: 🐛 Bug report +description: Raise an issue here if you find a bug. 
+title: "[Bug]: " +labels: ["bug"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: 🐛 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="facebook/opt-125m") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: markdown + attributes: + value: > + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + + - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). + + - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. + + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature request.yml new file mode 100644 index 0000000000000..0dd5a3e5d14de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/500-feature request.yml @@ -0,0 +1,31 @@ +name: 🚀 Feature request +description: Submit a proposal/request for a new vllm feature +title: "[Feature]: " +labels: ["feature"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new model.yml new file mode 100644 index 0000000000000..bbddbfd67138a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/600-new model.yml @@ -0,0 +1,33 @@ +name: 🤗 Support request for a new model from huggingface +description: Submit a proposal/request for a new model from huggingface +title: "[New Model]: " +labels: ["new model"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). + + #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. +- type: textarea + attributes: + label: The model to consider. + description: > + A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . + validations: + required: true +- type: textarea + attributes: + label: The closest model vllm already supports. + description: > + Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? +- type: textarea + attributes: + label: What's your difficulty of supporting the model you want? + description: > + For example, any new operators or new architecture? +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml new file mode 100644 index 0000000000000..9e8e7b4aa3530 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml @@ -0,0 +1,51 @@ +name: ⚡ Discussion on the performance of vllm +description: Submit a proposal/discussion about the performance of vllm +title: "[Performance]: " +labels: ["performance"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Proposal to improve performance + description: > + How do you plan to improve vllm's performance? + validations: + required: false +- type: textarea + attributes: + label: Report of performance regression + description: > + Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . + validations: + required: false +- type: textarea + attributes: + label: Misc discussion on performance + description: > + Anything about the performance. + validations: + required: false +- type: textarea + attributes: + label: Your current environment (if you think it is necessary) + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc discussion.yml new file mode 100644 index 0000000000000..ddb10f72db293 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/800-misc discussion.yml @@ -0,0 +1,21 @@ +name: 🎲 Misc/random discussions that do not fit into the above categories. +description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. +title: "[Misc]: " +labels: ["misc"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Anything you want to discuss about vllm. + description: > + Anything you want to discuss about vllm. + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000..3ba13e0cec6cb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.yapfignore b/.yapfignore new file mode 100644 index 0000000000000..2d6dcf8380cac --- /dev/null +++ b/.yapfignore @@ -0,0 +1 @@ +collect_env.py diff --git a/collect_env.py b/collect_env.py new file mode 100644 index 0000000000000..a886db693e2f1 --- /dev/null +++ b/collect_env.py @@ -0,0 +1,688 @@ +# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +import datetime +import locale +import re +import subprocess +import sys +import os +from collections import namedtuple + + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple('SystemEnv', [ + 'torch_version', + 'is_debug_build', + 'cuda_compiled_version', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'is_cuda_available', + 'cuda_runtime_version', + 'cuda_module_loading', + 'nvidia_driver_version', + 'nvidia_gpu_models', + 'cudnn_version', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'hip_compiled_version', + 'hip_runtime_version', + 'miopen_runtime_version', + 'caching_allocator_config', + 'is_xnnpack_available', + 'cpu_info', + 'rocm_version', # vllm specific field + 'neuron_sdk_version', # vllm specific field + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field +]) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = 
run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") + and any(name in line for name in patterns) + ) + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', r'cmake (.*)') + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == 'darwin': + cmd = 'kextstat | grep -i cuda' + return run_and_parse_first_match(run_lambda, cmd, + r'com[.]nvidia[.]CUDA [(](.*?)[)]') + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r'Driver Version: (.*?) ') + + +def get_gpu_info(run_lambda): + if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(torch.version, 'hip') and torch.version.hip is not None): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r' \(UUID: .+?\)') + rc, out, _ = run_lambda(smi + ' -L') + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, '', out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'nvcc --version', r'release .+ V(.*)') + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") + where_cmd = os.path.join(system_root, 'System32', 'where') + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == 'darwin': + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. 
+ cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get('CUDNN_LIBRARY') + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split('\n'): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = '\n'.join(files) + return 'Probably one of the following:\n{}'.format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = 'nvidia-smi' + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, 'hipcc --version', r'HIP version: (\S+)') + + +def get_neuron_sdk_version(run_lambda): + # Adapted from your install script + try: + result = run_lambda(["neuron-ls"]) + return result if result[0] == 0 else 'N/A' + except Exception: + return 'N/A' + + +def get_vllm_version(): + try: + import vllm + return vllm.__version__ + except ImportError: + return 'N/A' + + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. 
+ return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), + 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', + 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', + ) + + +def get_gpu_topo(run_lambda): + if get_platform() == 'linux': + return run_and_read_all(run_lambda, 'nvidia-smi topo -m') + return None + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda('wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE') + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + 
else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all(run_lambda, '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', r'Description:\t(.*)') + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join( + line + for line in out.splitlines() + if any(name in line for name in patterns) + ) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '') + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get('CUDA_MODULE_LOADING', '') + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, 'hip') or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + else: # HIP version + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else 'N/A' + + cfg = torch._C._show_config().split('\n') + hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime') + miopen_runtime_version = get_version_or_na(cfg, 'MIOpen') + cuda_version_str = 'N/A' + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A' + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + neuron_sdk_version = get_neuron_sdk_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format(sys_version, sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + 
cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + neuron_sdk_version=neuron_sdk_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + ) + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +env_info_fmt += """ +ROCM Version: {rocm_version} +Neuron SDK Version: {neuron_sdk_version} +vLLM Version: {vllm_version} +vLLM Build Flags: +{vllm_build_flags} +GPU Topology: +{gpu_topo} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement='Could not collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. 
+ if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict['nvidia_gpu_models'] = \ + maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + 'cuda_runtime_version', + 'nvidia_gpu_models', + 'nvidia_driver_version', + ] + all_cuda_fields = dynamic_cuda_fields + ['cudnn_version'] + all_dynamic_cuda_fields_missing = all( + mutable_dict[field] is None for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = 'No CUDA' + if envinfo.cuda_compiled_version is None: + mutable_dict['cuda_compiled_version'] = 'None' + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty(mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty(mutable_dict['conda_packages']) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend(mutable_dict['pip_packages'], + '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend(mutable_dict['conda_packages'], + '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include it when you file a report ***" + print(msg, file=sys.stderr) + + + +if __name__ == '__main__': + main() From 54be8a0be2819340ce7c2d7993382559597f5665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=BA=8F?= Date: Fri, 15 Mar 2024 04:56:57 +0800 Subject: [PATCH 102/113] Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373) Co-authored-by: Cade Daniel --- tests/test_config.py | 43 +++++++++++++++++++++++++++++++++++++++++++ vllm/config.py | 14 ++++++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 tests/test_config.py diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000000000..13a9f76212679 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,43 @@ +from vllm.config import ModelConfig + + +def test_get_sliding_window(): + TEST_SLIDING_WINDOW = 4096 + # Test that the sliding window is correctly computed. 
+ # For Qwen1.5/Qwen2, get_sliding_window() should be None + # when use_sliding_window is False. + qwen2_model_config = ModelConfig( + "Qwen/Qwen1.5-7B", + "Qwen/Qwen1.5-7B", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + + qwen2_model_config.hf_config.use_sliding_window = False + qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert qwen2_model_config.get_sliding_window() is None + + qwen2_model_config.hf_config.use_sliding_window = True + assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW + + mistral_model_config = ModelConfig( + "mistralai/Mistral-7B-v0.1", + "mistralai/Mistral-7B-v0.1", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + mistral_model_config.hf_config.sliding_window = None + assert mistral_model_config.get_sliding_window() is None + + mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index 319c1569f5e98..de687395a0001 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -103,6 +103,7 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -139,7 +140,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -232,6 +233,15 @@ def verify_with_parallel_config( f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size, or None if disabled. + """ + + # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in + # addition to sliding window size. We check if that field is present + # and if it's False, return None. + if (hasattr(self.hf_config, "use_sliding_window") + and not self.hf_config.use_sliding_window): + return None return getattr(self.hf_config, "sliding_window", None) def get_vocab_size(self) -> int: @@ -624,7 +634,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. From 4518f5a981aba715d7f0e8b2aab4cbef18196aa6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:18:25 +0000 Subject: [PATCH 103/113] format --- vllm/config.py | 3 ++- vllm/model_executor/layers/linear.py | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c1fe1397ca7e9..56fe6b522e7ee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -172,7 +172,8 @@ def _verify_sparsity(self) -> None: raise ValueError("Both sparsity and quantization detected. 
Only " "one or the other is supported at a time.") - if self.sparsity is not None and self.sparsity not in supported_sparsity: + if (self.sparsity is not None + and self.sparsity not in supported_sparsity): raise ValueError(f"Unknown sparse method: {self.sparsity}. Must " f"be one of {supported_sparsity}.") diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dbddea7a3b5f3..131f1ea2208b2 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -338,9 +338,10 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - # This is super hacky for now but we basically want to only compress once all - # of the shards are loaded, right now we just check if the number of shards - # loaded matches the number of outputs expected, assuming one shard per output + # This is super hacky for now but we basically want to only compress + # once all of the shards are loaded, right now we just check if the + # number of shards loaded matches the number of outputs expected, + # assuming one shard per output all_shards_loaded = (len(self.loaded_shards) == len(self.output_sizes)) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() @@ -489,9 +490,9 @@ def weight_loader(self, self.loaded_shards.add(loaded_shard_id) - # This is super hacky for now but we basically want to only compress once - # all of the shards are loaded, for the QKV matrix this means - # loading shards "q", "k" and "v" + # This is super hacky for now but we basically want to only + # compress once all of the shards are loaded, for the QKV matrix + # this means loading shards "q", "k" and "v" all_shards_loaded = (self.loaded_shards == set(["q", "k", "v"])) if all_shards_loaded and isinstance(param, LazyCompressedParameter): param.compress() From 5bc7a73116a67525b3692a2ecd9a51a5838b4e29 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:24:49 +0000 Subject: [PATCH 104/113] formating --- .../layers/parameters/__init__.py | 4 +++- .../layers/parameters/lazy_compressed.py | 21 +++++++++++-------- .../layers/sparsity/__init__.py | 8 ++++--- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index c05cdf56e27a4..d05d73a79c13e 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,4 +1,6 @@ -from vllm.model_executor.layers.parameters.lazy_compressed import LazyCompressedParameter +from vllm.model_executor.layers.parameters.lazy_compressed import ( + LazyCompressedParameter +) __all__ = [ "LazyCompressedParameter", diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 37128a6ed54b7..65d44167c004a 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -66,7 +66,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs): def unwrap(e): nonlocal ret_storage_format_cls if isinstance(e, LazyCompressedParameter): - assert ret_storage_format_cls is None or ret_storage_format_cls == e.storage_format_cls + assert (ret_storage_format_cls is None or + ret_storage_format_cls == e.storage_format_cls) ret_storage_format_cls = e.storage_format_cls if e.is_empty: @@ -86,7 +87,8 @@ def wrap(e): torch.Tensor) and ret_storage_format_cls is not None: 
return LazyCompressedParameter( e, - # Here, "e" is the output of "func" so it is real data and we store it + # Here, "e" is the output of "func" so it is real + # data and we store it is_empty=False, storage_format_cls=ret_storage_format_cls) return e @@ -98,9 +100,10 @@ def compress(self) -> None: from magic_wand import SparseSemiStructuredStorageFormat if self.storage_format_cls == SparseSemiStructuredStorageFormat: - # Semi-structured sparsity assumes a 2:4 pattern, where each 4 elements - # have at minimum 2 zeros. We need to validate this pattern exists, so - # we check the whole tensor before committing to compression. + # Semi-structured sparsity assumes a 2:4 pattern, where + # each 4 elements have at minimum 2 zeros. We need to validate + # this pattern exists, so we check the whole tensor + # before committing to compression. # Count zeros in each group of 4 reshaped_tensor = self.uncompressed_data.view(-1, 4) @@ -112,8 +115,8 @@ def compress(self) -> None: if not has_semi_structured_sparsity: logger.warning( - f"Called compress() on tensor of shape {self.shape} but does not " - "have 2:4 sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape} but " + "does not have 2:4 sparsity, skipping compression") return else: @@ -123,8 +126,8 @@ def compress(self) -> None: # Only compress if we have sufficient sparsity (>=40%) if sparsity < 0.4: logger.warning( - f"Called compress() on tensor of shape {self.shape} but only has " - f"{sparsity:.2}% sparsity, skipping compression") + f"Called compress() on tensor of shape {self.shape}, but " + f"only has {sparsity:.2}% sparsity, skipping compression") return if self.uncompressed_data is None: diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 874819f343373..204281924e9ad 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -7,9 +7,11 @@ "magic_wand is not available and required for sparsity " "support. Please install it with `pip install nm-magic-wand`") -from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 -from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import SemiStructuredSparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( + SemiStructuredSparseW16A16Config # noqa: E402 +) _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, From 6f60731d98dcbd763c86e643194e207be0b2f65f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:28:34 +0000 Subject: [PATCH 105/113] ruff --- tests/models/compare_utils.py | 16 ++++++++++------ tests/models/test_compressed.py | 6 +++--- tests/models/test_compressed_memory.py | 12 ++++++++---- tests/models/test_marlin.py | 4 +++- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index aefaf881048b8..235ccad0549a7 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -1,5 +1,5 @@ -"""Compare the logprobs of two sequences generated by different models, which should -be similar but not necessarily equal. 
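To make the two compression gates above concrete, here is a small self-contained sketch; it needs only torch, and the names are illustrative rather than the module's API.

import torch


def has_2_4_sparsity(t: torch.Tensor) -> bool:
    # 2:4 semi-structured pattern: every contiguous group of 4 values
    # must contain at least 2 zeros.
    zeros_per_group = (t.reshape(-1, 4) == 0).sum(dim=1)
    return bool((zeros_per_group >= 2).all())


ok = torch.tensor([0.0, 1.0, 0.0, 2.0, 3.0, 0.0, 0.0, 4.0])
bad = torch.tensor([1.0, 2.0, 3.0, 0.0, 0.0, 0.0, 5.0, 6.0])
print(has_2_4_sparsity(ok))   # True: both groups of 4 hold >= 2 zeros
print(has_2_4_sparsity(bad))  # False: the first group holds only 1 zero

# The unstructured path instead checks overall sparsity against a threshold:
sparsity = 1.0 - torch.count_nonzero(bad) / bad.numel()
print(float(sparsity))        # 0.375 -> below 0.4, so compression is skipped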
+"""Compare the logprobs of two sequences generated by different models, +which should be similar but not necessarily equal. """ @@ -15,14 +15,18 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): for idx, (output_id_0, output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): - # If generated tokens don't match ... + # If generated tokens don't match, then if output_id_0 != output_id_1: - # ... each predicted token must be in top N logprobs of the other's + # Each predicted token must be in top N logprobs of the other assert output_id_0 in logprobs_1[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}" ) assert output_id_1 in logprobs_0[idx], ( - f"Test{prompt_idx}:\n{name_0}:\t{output_str_0!r}\n{name_1}:\t{output_str_1!r}" + f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}" ) # Break out since sequences will now diverge. diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index fed9dfb35e881..c6fce5ef8ae7d 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -1,4 +1,4 @@ -"""Compare the outputs of a sparse model running sparse vs sparse model running dense. +"""Compare the outputs of a sparse model vs sparse model running dense. Note: sparse kernels do not have bitwise correctness vs the dense models. As a result, in this test, we just confirm that the top selected tokens of the sparse models are in the top N selections of same model running dense. @@ -41,7 +41,7 @@ def test_models( sparse_outputs = sparse_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker del sparse_model gc.collect() @@ -53,7 +53,7 @@ def test_models( dense_outputs = dense_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del dense_model.model.llm_engine.driver_worker del dense_model gc.collect() diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index 1abb9269dc15e..c331e7132272d 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -36,9 +36,11 @@ def test_models( sparsity=None, dtype=dtype, max_model_len=1024) - dense_num_kv_blocks = dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + dense_num_kv_blocks = ( + dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + ) - # Note: deleting just the model does not always free the GPU memory, not sure why. + # Deleting just the model does not always free the GPU memory. del dense_model.model.llm_engine.driver_worker del dense_model torch.cuda.empty_cache() @@ -48,9 +50,11 @@ def test_models( sparsity=sparsity, dtype=dtype, max_model_len=1024) - sparse_num_kv_blocks = sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + sparse_num_kv_blocks = ( + sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks + ) - # Note: deleting just the model does not always free the GPU memory, not sure why. 
+ # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker del sparse_model torch.cuda.empty_cache() diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 1dca24ffa9a53..35dfd7c19d8df 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -20,7 +20,9 @@ import gc from compare_utils import check_logprobs_close from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY +) MAX_MODEL_LEN = 1024 From 5ba2ee147fa014b31fa6a78692a05fdd5e046c62 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:29:59 +0000 Subject: [PATCH 106/113] ruff again --- vllm/model_executor/layers/sparsity/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 204281924e9ad..24292d849b905 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -9,8 +9,8 @@ from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( - SemiStructuredSparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 + SemiStructuredSparseW16A16Config ) _SPARSITY_CONFIG_REGISTRY = { From d342426135b1da66561cb64065617d946b9d337f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 14 Mar 2024 23:39:16 +0000 Subject: [PATCH 107/113] yapf --- tests/models/compare_utils.py | 6 ++---- tests/models/test_compressed_memory.py | 10 ++++------ tests/models/test_marlin.py | 3 +-- vllm/model_executor/layers/parameters/__init__.py | 3 +-- .../layers/parameters/lazy_compressed.py | 12 ++++++------ vllm/model_executor/layers/sparsity/__init__.py | 9 ++++----- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tests/models/compare_utils.py b/tests/models/compare_utils.py index 235ccad0549a7..44319b6ca45ff 100644 --- a/tests/models/compare_utils.py +++ b/tests/models/compare_utils.py @@ -21,13 +21,11 @@ def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1): assert output_id_0 in logprobs_1[idx], ( f"Test{prompt_idx}:" f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}" - ) + f"\n{name_1}:\t{output_str_1!r}") assert output_id_1 in logprobs_0[idx], ( f"Test{prompt_idx}:" f"\n{name_0}:\t{output_str_0!r}" - f"\n{name_1}:\t{output_str_1!r}" - ) + f"\n{name_1}:\t{output_str_1!r}") # Break out since sequences will now diverge. break diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index c331e7132272d..056452b77e020 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -36,9 +36,8 @@ def test_models( sparsity=None, dtype=dtype, max_model_len=1024) - dense_num_kv_blocks = ( - dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks - ) + dense_num_kv_blocks = (dense_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) # Deleting just the model does not always free the GPU memory. 
del dense_model.model.llm_engine.driver_worker @@ -50,9 +49,8 @@ def test_models( sparsity=sparsity, dtype=dtype, max_model_len=1024) - sparse_num_kv_blocks = ( - sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks - ) + sparse_num_kv_blocks = (sparse_model.model.llm_engine.scheduler. + block_manager.gpu_allocator.num_blocks) # Deleting just the model does not always free the GPU memory. del sparse_model.model.llm_engine.driver_worker diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 35dfd7c19d8df..e524b785af389 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -21,8 +21,7 @@ from compare_utils import check_logprobs_close from dataclasses import dataclass from vllm.model_executor.layers.quantization import ( - _QUANTIZATION_CONFIG_REGISTRY -) + _QUANTIZATION_CONFIG_REGISTRY) MAX_MODEL_LEN = 1024 diff --git a/vllm/model_executor/layers/parameters/__init__.py b/vllm/model_executor/layers/parameters/__init__.py index d05d73a79c13e..6cb53db01d3f6 100644 --- a/vllm/model_executor/layers/parameters/__init__.py +++ b/vllm/model_executor/layers/parameters/__init__.py @@ -1,6 +1,5 @@ from vllm.model_executor.layers.parameters.lazy_compressed import ( - LazyCompressedParameter -) + LazyCompressedParameter) __all__ = [ "LazyCompressedParameter", diff --git a/vllm/model_executor/layers/parameters/lazy_compressed.py b/vllm/model_executor/layers/parameters/lazy_compressed.py index 65d44167c004a..05d6bfb27008f 100644 --- a/vllm/model_executor/layers/parameters/lazy_compressed.py +++ b/vllm/model_executor/layers/parameters/lazy_compressed.py @@ -66,8 +66,8 @@ def __torch_dispatch__(cls, func, types, args, kwargs): def unwrap(e): nonlocal ret_storage_format_cls if isinstance(e, LazyCompressedParameter): - assert (ret_storage_format_cls is None or - ret_storage_format_cls == e.storage_format_cls) + assert (ret_storage_format_cls is None + or ret_storage_format_cls == e.storage_format_cls) ret_storage_format_cls = e.storage_format_cls if e.is_empty: @@ -100,9 +100,9 @@ def compress(self) -> None: from magic_wand import SparseSemiStructuredStorageFormat if self.storage_format_cls == SparseSemiStructuredStorageFormat: - # Semi-structured sparsity assumes a 2:4 pattern, where - # each 4 elements have at minimum 2 zeros. We need to validate - # this pattern exists, so we check the whole tensor + # Semi-structured sparsity assumes a 2:4 pattern, where + # each 4 elements have at minimum 2 zeros. We need to validate + # this pattern exists, so we check the whole tensor # before committing to compression. # Count zeros in each group of 4 @@ -116,7 +116,7 @@ def compress(self) -> None: if not has_semi_structured_sparsity: logger.warning( f"Called compress() on tensor of shape {self.shape} but " - "does not have 2:4 sparsity, skipping compression") + "does not have 2:4 sparsity, skipping compression") return else: diff --git a/vllm/model_executor/layers/sparsity/__init__.py b/vllm/model_executor/layers/sparsity/__init__.py index 24292d849b905..df2ca0f1b773f 100644 --- a/vllm/model_executor/layers/sparsity/__init__.py +++ b/vllm/model_executor/layers/sparsity/__init__.py @@ -7,11 +7,10 @@ "magic_wand is not available and required for sparsity " "support. 
Please install it with `pip install nm-magic-wand`") -from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 -from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 -from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 - SemiStructuredSparseW16A16Config -) +from vllm.model_executor.layers.sparsity.base_config import SparsityConfig # noqa: E402 +from vllm.model_executor.layers.sparsity.sparse_w16a16 import SparseW16A16Config # noqa: E402 +from vllm.model_executor.layers.sparsity.semi_structured_sparse_w16a16 import ( # noqa: E402 + SemiStructuredSparseW16A16Config) _SPARSITY_CONFIG_REGISTRY = { "sparse_w16a16": SparseW16A16Config, From e2835280cc5415b80d7597e1c5ffb681bf3c790d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:08:02 +0000 Subject: [PATCH 108/113] finalized ruff --- benchmarks/backend_request_func.py | 2 + benchmarks/benchmark_prefix_caching.py | 3 + benchmarks/benchmark_serving.py | 3 + collect_env.py | 4 ++ csrc/punica/bgmv/generator.py | 2 +- examples/multilora_inference.py | 2 + examples/offline_inference_with_prefix.py | 3 + neuralmagic/benchmarks/common.py | 8 +-- .../benchmarks/run_benchmark_serving.py | 33 ++++++--- .../benchmarks/run_benchmark_throughput.py | 10 ++- .../scripts/backend_request_func.py | 5 +- .../benchmarks/scripts/benchmark_serving.py | 67 ++++++++++--------- .../scripts/benchmark_throughput.py | 8 ++- neuralmagic/benchmarks/scripts/common.py | 13 ++-- .../benchmarks/scripts/datasets_registry.py | 4 +- .../scripts/logging/benchmark_result.py | 10 +-- .../scripts/logging/gha_benchmark_logging.py | 50 +++++++------- neuralmagic/tools/call_cmd.py | 11 +-- setup.py | 2 + 19 files changed, 144 insertions(+), 96 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 26d2c24d5655c..8782f5546b21e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation # This file has been modified by Neural Magic import json diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index a0307439cd5f1..5867e3b171919 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + import argparse import time diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 3f5e2d9c8f4dc..040e96458a14b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + """Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/collect_env.py b/collect_env.py index a886db693e2f1..3c914795222ee 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,3 +1,7 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff. +# This file has been modified by Neural Magic + # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py # Unlike the rest of the PyTorch this file must be python2 compliant. 
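The sparsity package above fails fast when its optional backend is missing. A generic, runnable sketch of that guard pattern follows; the find_spec check and helper name are assumptions, only the error text is taken from the module.

import importlib.util


def require_package(name: str, install_hint: str) -> None:
    # Raise an actionable error when an optional backend is not installed.
    if importlib.util.find_spec(name) is None:
        raise ValueError(
            f"{name} is not available and required for sparsity support. "
            f"Please install it with `{install_hint}`")


# Example (left commented out so the sketch never raises on machines
# without the package):
# require_package("magic_wand", "pip install nm-magic-wand")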
diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py index 66de56d74f3e7..a92c67180372a 100644 --- a/csrc/punica/bgmv/generator.py +++ b/csrc/punica/bgmv/generator.py @@ -10,7 +10,7 @@ #include "bgmv_impl.cuh" FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() +""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) for input_dtype in DTYPES: for output_dtype in DTYPES: diff --git a/examples/multilora_inference.py b/examples/multilora_inference.py index cd4451481ca83..7b1d580a9a7f6 100644 --- a/examples/multilora_inference.py +++ b/examples/multilora_inference.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation """ This example shows how to use the multi-LoRA functionality for offline inference. diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 1aa718b88907c..2c6c6aa63944d 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,3 +1,6 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation + from vllm import LLM, SamplingParams prefix = ( diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 398f8973cc8d2..089a347e49194 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -27,8 +27,8 @@ def max_model_length_from_model_id(model: str, def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: - #config is a NamedTuple constructed from some JSON in neuralmagic/benchmarks/configs - + # config is a NamedTuple constructed from some JSON + # in neuralmagic/benchmarks/configs kv = vars(config.script_args) keys = kv.keys() @@ -57,8 +57,8 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]: """ - Give a path to a config file in `neuralmagic/benchmarks/configs/*` return an Iterable of - (sub)configs in the file + Give a path to a config file in `neuralmagic/benchmarks/configs/*` + return an Iterable of (sub)configs in the file """ assert config_file_path.exists() diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index 110d47e354e24..a9eccb3666d20 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -8,7 +8,10 @@ from typing import NamedTuple, Optional from pathlib import Path -from .common import download_model, max_model_length_from_model_id, script_args_to_cla, benchmark_configs +from .common import ( + download_model, max_model_length_from_model_id, + script_args_to_cla, benchmark_configs +) from .scripts.common import warmup_server, num_available_gpus from ..tools.call_cmd import call_cmd @@ -56,18 +59,25 @@ def try_connection() -> bool: return False -def run_benchmark_serving_script(config: NamedTuple, - output_directory: Optional[Path] = None - ) -> None: +def run_benchmark_serving_script( + config: NamedTuple, + output_directory: Optional[Path] = None +) -> None: assert config.script_name == 'benchmark_serving' - def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: + def run_bench( + server_cmd: str, + bench_cmd: list[str], + model: str + ) -> None: try: # start server - server_process = subprocess.Popen("exec " + server_cmd, shell=True) + server_process = subprocess.Popen( + "exec " + server_cmd, shell=True) if not 
is_server_running(BENCH_SERVER_HOST, BENCH_SERVER_PORT): raise ValueError( - f"Aborting bench run with : server-cmd {server_cmd} , bench-cmd {bench_cmd}. Reason: Cannot start Server" + f"Aborting bench run with : server-cmd {server_cmd} , " + f"bench-cmd {bench_cmd}. Reason: Cannot start Server" ) # server warmup @@ -96,13 +106,15 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with the + # maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" + f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}" ) for max_model_len in max_model_lens: @@ -120,7 +132,8 @@ def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: server_args["sparsity"] = sparsity server_cmd = "python3 -m vllm.entrypoints.api_server " + \ - " ".join([f"--{k} {v}" for k, v in server_args.items()]) + " ".join([f"--{k} {v}" + for k, v in server_args.items()]) for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py index d6a505df71559..a28c1a9b73ea4 100644 --- a/neuralmagic/benchmarks/run_benchmark_throughput.py +++ b/neuralmagic/benchmarks/run_benchmark_throughput.py @@ -3,7 +3,9 @@ from pathlib import Path from typing import NamedTuple, Optional -from .common import script_args_to_cla, benchmark_configs, max_model_length_from_model_id +from .common import ( + script_args_to_cla, benchmark_configs, max_model_length_from_model_id +) from ..tools.call_cmd import call_cmd @@ -19,13 +21,15 @@ def run_benchmark_throughput_script(config: NamedTuple, supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the maximum supported for this model. + # If the requested model-len is too big, try running with + # the maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): print( - f"WARNING: max_model_len modified to {max_model_lens} from {config.max_model_lens} for model {model}" + f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}" ) for max_model_len in max_model_lens: diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py index 078cfd1c6a7fc..dc3855f54418e 100644 --- a/neuralmagic/benchmarks/scripts/backend_request_func.py +++ b/neuralmagic/benchmarks/scripts/backend_request_func.py @@ -135,7 +135,7 @@ async def async_request_vllm( data = part_data output.latency = time.perf_counter() - st - # When streaming, '\0' is appended to the end of the response. + # When streaming, '\0' is appended to the end. 
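is_server_running() is called in run_bench() above but its definition is outside this hunk; the following is a minimal sketch of the kind of readiness poll it is assumed to perform, with illustrative host, port, and timeout values.

import socket
import time


def wait_for_port(host: str, port: int,
                  timeout_s: float = 300.0, poll_s: float = 2.0) -> bool:
    # Poll until a TCP connection to host:port succeeds or the timeout expires.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=poll_s):
                return True
        except OSError:
            time.sleep(poll_s)
    return False


# Example: block until a locally launched api_server accepts connections.
# wait_for_port("localhost", 8000)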
body = trim_suffix(data.decode('utf-8'), "\0") output.generated_text = json.loads( body)["text"][0][len(request_func_input.prompt):] @@ -220,7 +220,8 @@ async def async_request_deepspeed_mii( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len - # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder. + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # will use 0 as placeholder. # https://github.com/microsoft/DeepSpeed-MII/pull/311 output.ttft = 0 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index f0c1d8d9951fc..5e0ca7d52aa43 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -238,18 +238,14 @@ async def benchmark(backend: str, api_url: str, model_id: str, print(f"Benchmark duration: {metrics.metadata.duration:2f} s") print(f"Total input tokens: {metrics.metadata.total_input}") print(f"Total generated tokens: {metrics.metadata.total_output}") - print( - f"Request throughput: {metrics.metrics.request_throughput:.2f} requests/s" - ) - print( - f"Input token throughput: {metrics.metrics.input_throughput:.2f} tokens/s" - ) - print( - f"Output token throughput: {metrics.metrics.output_throughput:.2f} tokens/s" - ) - print( - f"Median request latency: {metrics.metrics.median_request_latency:.2f} ms" - ) + print(f"Request throughput: " + f"{metrics.metrics.request_throughput:.2f} requests/s") + print(f"Input token throughput: " + f"{metrics.metrics.input_throughput:.2f} tokens/s") + print(f"Output token throughput: " + f"{metrics.metrics.output_throughput:.2f} tokens/s") + print(f"Median request latency: " + f"{metrics.metrics.median_request_latency:.2f} ms") print(f"P90 request latency: {metrics.metrics.p90_request_latency:.2f} ms") print(f"P99 request latency: {metrics.metrics.p99_request_latency:.2f} ms") print(f"Mean TTFT: {metrics.metrics.mean_ttft_ms:.2f} ms") @@ -349,9 +345,10 @@ def script_args_as_json_dict(script_args: argparse.Namespace): result = metrics.update_benchmark_result(result) # Add information about the derived variables as metadata - result[BenchmarkResult.METADATA_KEY_][ + metadata_key = BenchmarkResult.METADATA_KEY_ + result[metadata_key][ ResultMetadataKeys.num_prompts] = num_prompts - result[BenchmarkResult.METADATA_KEY_][ResultMetadataKeys.request_rate] = \ + result[metadata_key][ResultMetadataKeys.request_rate] = \ request_rate if request_rate < float("inf") else "inf" # Save to file @@ -388,7 +385,8 @@ def from_str(arg: str): type=str, default="benchmark-serving", help= - "Benchmark description. This is primarily useful when we log the benchmark results and process them for plotting charts" + "Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts" ) parser.add_argument( "--backend", @@ -437,8 +435,8 @@ def from_str(arg: str): parser.add_argument( "--tokenizer", type=str, - help= - "Name or path of the tokenizer, if not using the default model tokenizer.", + help="Name or path of the tokenizer, " + "if not using the default model tokenizer.", ) parser.add_argument( "--best-of", @@ -482,15 +480,15 @@ def from_str(arg: str): "Otherwise, we use Poisson process to synthesize " "the request arrival times.", ) - parser.add_argument("--nr-qps-pair_", - type=NumPrompts_RequestRate_T.from_str, - help=""" - First argument in the pair is num_prompts: Number of prompts to process. 
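The help text above refers to a Poisson arrival process; concretely, that means exponentially distributed gaps between consecutive requests. A small numpy-based sketch follows (the script's own sampling code is not shown in this hunk).

import numpy as np


def poisson_arrival_times(num_requests: int, request_rate: float,
                          seed: int = 0) -> np.ndarray:
    # request_rate == inf -> every request is sent at t = 0.
    if np.isinf(request_rate):
        return np.zeros(num_requests)
    rng = np.random.default_rng(seed)
    # Exponential inter-arrival gaps with mean 1/request_rate yield a
    # Poisson process at `request_rate` requests per second.
    gaps = rng.exponential(scale=1.0 / request_rate, size=num_requests)
    return np.cumsum(gaps)


print(poisson_arrival_times(5, request_rate=2.0))           # send times in s
print(poisson_arrival_times(3, request_rate=float("inf")))  # [0. 0. 0.]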
- Second argument in the pair is request_rate : Number of requests per second. If this is inf, - then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize - the request arrival times. - """, - default=None) + parser.add_argument( + "--nr-qps-pair_", + type=NumPrompts_RequestRate_T.from_str, + help=""" + First argument in the pair is num_prompts to process. + Second argument in the pair is request_rate per second. + If this is inf, then all the requests are sent at time 0. + Otherwise, we use Poisson process to synthesize""", + default=None) # Server command args parser.add_argument( @@ -498,29 +496,34 @@ def from_str(arg: str): type=int, default=None, help= - "tensor-parallel-size that the benchmarking script was invoked with. It is useful to log this information when storing benchmarking results" + "tensor-parallel-size that the benchmarking script was invoked with. " + "It is useful to log this information when storing benchmarking results" ) parser.add_argument( "--server-args", type=str, default=None, help= - "When we are logging the output, it is useful to log the arguments passed to the server" + "When we are logging the output, it is useful to log the " + "arguments passed to the server" ) def args_sanity_check(args): # Sanity check real-dataset vs synthetic-dataset usecase if args.dataset is None: - assert args.num_input_tokens is not None and args.num_output_tokens is not None + assert (args.num_input_tokens is not None and + args.num_output_tokens is not None) else: - assert args.num_input_tokens is None and args.num_output_tokens is None - # Sanity check num_prompts, request_rate as separate args vs joint args usecase + assert (args.num_input_tokens is None and + args.num_output_tokens is None) + # Sanity check num_prompts, request_rate as separate args vs joint args assert not all([ args.num_prompts_ is None, args.request_rate_ is None, args.nr_qps_pair_ is None ]) if args.nr_qps_pair_ is None: - assert args.num_prompts_ is not None and args.request_rate_ is not None + assert (args.num_prompts_ is not None and + args.request_rate_ is not None) else: assert args.num_prompts_ is None and args.request_rate_ is None # Sanity check required logging args diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index 9138ea0f8ad47..f351a70abdb60 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -12,7 +12,8 @@ from pathlib import Path from typing import List, Optional, Tuple from transformers import AutoTokenizer -from .common import generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs +from .common import (generate_synthetic_requests, warmup_vllm_engine, + num_available_gpus, print_request_outputs) from .datasets_registry import get_dataset, DatasetArgs from .logging.benchmark_result import (BenchmarkResult, BenchmarkThroughputResultMetricTemplates @@ -163,7 +164,7 @@ def main(args: argparse.Namespace): current_dt_str = current_dt.strftime("%Y%m%d-%H%M%S") file_name = Path( args.save_directory - ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" + ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 result.store(file_name) @@ -174,7 +175,8 @@ def main(args: argparse.Namespace): type=str, default="benchmark-throughput", help= - "Benchmark description. 
This is primarily useful when we log the benchmark results and process them for plotting charts" + "Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts" ) parser.add_argument("--backend", type=str, diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index d4addb99a2878..8fbe292d6abc8 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -12,7 +12,8 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR -from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm +from .backend_request_func import ( + RequestFuncInput, RequestFuncOutput, async_request_vllm) from ...tools.call_cmd import call_cmd @@ -23,7 +24,7 @@ def num_available_gpus() -> int: def get_benchmarking_context() -> dict: """ - Return the current python version, pytorch version and CUDA version as a dict + Return the current python, pytorch and CUDA version as a dict """ import sys import torch @@ -100,7 +101,7 @@ def warmup_requests(tokenizer: PreTrainedTokenizerBase, num_input_tokens: int = 128, num_output_tokens: int = 1) -> List[Tuple[str, int, int]]: """ - Given a tokenizer, generate `num_requests` requests that would be used for vllm engine warmup + Given a tokenizer, generate `num_requests` requests used for warmup """ words = list(tokenizer.get_vocab().keys()) requests = [] @@ -187,7 +188,7 @@ async def process_requests(input_requests): def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int, n_output_tokens: int) -> str: - return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" + return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 def print_request_outputs(results: List[RequestOutput]) -> None: @@ -202,8 +203,8 @@ def print_request_outputs(results: List[RequestOutput]) -> None: def print_serving_request_io(inputs: List[Tuple[str, int, int]], outputs: List[RequestFuncOutput]) -> None: """ - inputs: list of tuples where the tuple is [prompt, prompt_length, output_length], - outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py) + inputs: list of tuples of form [prompt, prompt_length, output_length], + outputs: list of RequestFuncOutput output from benchmark_serving.py Format and print the inputs and outputs. 
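get_benchmarking_context() above shows only its docstring in this hunk; the sketch below illustrates the kind of dict it might return (field names are illustrative, not the real keys).

import sys

import torch


def get_benchmarking_context() -> dict:
    # Snapshot of the software environment to store alongside results.
    return {
        "python_version": sys.version,
        "torch_version": str(torch.__version__),
        "torch_cuda_version": torch.version.cuda,
        "cuda_device_names": [
            torch.cuda.get_device_name(i)
            for i in range(torch.cuda.device_count())
        ],
    }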
""" for i, o in zip(inputs, outputs): diff --git a/neuralmagic/benchmarks/scripts/datasets_registry.py b/neuralmagic/benchmarks/scripts/datasets_registry.py index b710c712d24cb..c1c4d02e725a0 100644 --- a/neuralmagic/benchmarks/scripts/datasets_registry.py +++ b/neuralmagic/benchmarks/scripts/datasets_registry.py @@ -63,8 +63,8 @@ def get_ultrachat(tokenizer: PreTrainedTokenizerBase, prompts = [] completions = [] system_message = { - "content": - "You are a chatbot with the explicit goal of helping the user as best as possible", + "content": "You are a chatbot with the explicit goal of " + "helping the user as best as possible", "role": "system", } for messages in ds["messages"]: diff --git a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py index a997cbb855698..37b9c49aa9fd4 100644 --- a/neuralmagic/benchmarks/scripts/logging/benchmark_result.py +++ b/neuralmagic/benchmarks/scripts/logging/benchmark_result.py @@ -1,5 +1,5 @@ """ -Defines a BenchmarkResult class that all the benchmarks use store the benchmark results. +Defines a BenchmarkResult class that all the benchmarks use to save results. """ import json @@ -16,9 +16,9 @@ # NOTE - PLEASE READ: # Any modifications that adds/removes the keys in the JSON that BenchmarkResult # produces should also update the BENCHMARK_RESULTS_SCHEMA_VERSION. -# The primary use case is to establish a set of keys that can be queried against reliably. -# TODO (varun) : Initial version is named 0.0.0 as things are under development. Update it -# when things are stable. +# The primary use case is to establish a set of keys that can be queried. +# TODO (varun) : Initial version is named 0.0.0 as things are under development. +# Update it when things are stable. BENCHMARK_RESULTS_SCHEMA_VERSION = "0.0.0" @@ -158,7 +158,7 @@ def __init__(self, description: str, date: datetime, script_name: str, dataset if dataset is not None else "synthetic", self.SCRIPT_ARGS_KEY_: script_args, - # Any metadata that the caller script wants to store should be stored here. + # Any metadata that the caller script wants to store. self.METADATA_KEY_: {}, # Any benchmarking metrics should be stored here. self.METRICS_KEY_: {} diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py index a7564417ba702..116eba43f13d2 100644 --- a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py +++ b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py @@ -10,7 +10,9 @@ from dataclasses import dataclass from typing import List, Iterable, NamedTuple -from .benchmark_result import GHABenchmarkToolName, BenchmarkResult, MetricTemplate +from .benchmark_result import ( + GHABenchmarkToolName, BenchmarkResult, MetricTemplate +) @dataclass @@ -123,29 +125,29 @@ def dump_to_json(gha_records: List[GHARecord], output_path: Path): Reference : https://github.com/benchmark-action/github-action-benchmark """) - parser.add_argument("-i", - "--input-json-directory", - required=True, - type=str, - help=""" - Path to the directory containing BenchmarkResult jsons. - This is typically the output directory passed to the benchmark - runner scripts like neuralmagic/benchmarks/run_benchmarks.py. - """) - - parser.add_argument("--bigger-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName BiggerIsBetter metrics are to be stored. 
- """) - - parser.add_argument("--smaller-is-better-output-file-path", - type=str, - required=True, - help=""" - An output file path, where the GHABenchmarkToolName SmallerIsBetter metrics are to be stored - """) + parser.add_argument( + "-i", + "--input-json-directory", + required=True, + type=str, + help="""Path to the directory containing BenchmarkResult + jsons. This is typically the output directory passed + to the benchmark runner scripts like + neuralmagic/benchmarks/run_benchmarks.py.""") + + parser.add_argument( + "--bigger-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + BiggerIsBetter metrics are to be stored.""") + + parser.add_argument( + "--smaller-is-better-output-file-path", + type=str, + required=True, + help="""An output file path, where the GHABenchmarkToolName + SmallerIsBetter metrics are to be stored""") args = parser.parse_args() diff --git a/neuralmagic/tools/call_cmd.py b/neuralmagic/tools/call_cmd.py index 2ff84a0c02a5f..2e3f06c267fd3 100644 --- a/neuralmagic/tools/call_cmd.py +++ b/neuralmagic/tools/call_cmd.py @@ -1,6 +1,9 @@ # -# Run cmd as a sub-process. Capture stdout, stderr, return status, elapsed time and -# optionally process statistics (user time, system time, peak memory usage, etc.) +# Run cmd as a sub-process. +# +# Capture stdout, stderr, return status, elapsed time and +# optionally process statistics +# (user time, system time, peak memory usage, etc.) # import os import re @@ -12,8 +15,8 @@ def parse_process_stats(str): exp = ( - "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " - "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" + "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 + "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" # noqa: E501 ) results = re.search(exp, str) if results: diff --git a/setup.py b/setup.py index f76447c0e7424..6c1b4a91134d0 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,5 @@ +# flake8: noqa +# UPSTREAM SYNC: noqa is required for passing ruff. # This file has been modified by Neural Magic import contextlib From c5633f2a2b540b1f29ae37eeb7b7377e9aa4045e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:12:46 +0000 Subject: [PATCH 109/113] yapf after ruff :) --- benchmarks/benchmark_serving.py | 1 - csrc/punica/bgmv/generator.py | 2 +- neuralmagic/benchmarks/common.py | 2 +- .../benchmarks/run_benchmark_serving.py | 35 ++++++----------- .../benchmarks/run_benchmark_throughput.py | 13 +++---- .../scripts/backend_request_func.py | 2 +- .../benchmarks/scripts/benchmark_serving.py | 38 ++++++++----------- .../scripts/benchmark_throughput.py | 10 ++--- neuralmagic/benchmarks/scripts/common.py | 6 +-- .../benchmarks/scripts/datasets_registry.py | 4 +- .../scripts/logging/gha_benchmark_logging.py | 5 +-- neuralmagic/tools/call_cmd.py | 6 +-- 12 files changed, 50 insertions(+), 74 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 040e96458a14b..7699304769653 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,6 +1,5 @@ # flake8: noqa # UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation - """Benchmark online serving throughput. 
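As a usage illustration of parse_process_stats() above, here is a sample stats line reconstructed from its regular expression; the "[Timing] run:" label is a guess, only the field layout is taken from the pattern.

import re

EXP = (r"\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) "
       r"maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)")

sample = ("[Timing] run: elapsed=12.3 user=10.1 system=1.2 "
          "maxrss=2048.0 avgrss=1024.0 avgmem=512.0 avgdata=256.0")

match = re.search(EXP, sample)
if match:
    elapsed, user, system, maxrss, avgrss, avgmem, avgdata = map(
        float, match.groups())
    print(elapsed, maxrss)  # 12.3 2048.0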
On the server side, run one of the following commands: diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py index a92c67180372a..7ceaf9e6892a5 100644 --- a/csrc/punica/bgmv/generator.py +++ b/csrc/punica/bgmv/generator.py @@ -10,7 +10,7 @@ #include "bgmv_impl.cuh" FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype}) -""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) +""".lstrip() # noqa: E501 (UPSTREAM SYNC nm-automation) for input_dtype in DTYPES: for output_dtype in DTYPES: diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index 089a347e49194..b0fa4fbe45187 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -27,7 +27,7 @@ def max_model_length_from_model_id(model: str, def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: - # config is a NamedTuple constructed from some JSON + # config is a NamedTuple constructed from some JSON # in neuralmagic/benchmarks/configs kv = vars(config.script_args) diff --git a/neuralmagic/benchmarks/run_benchmark_serving.py b/neuralmagic/benchmarks/run_benchmark_serving.py index a9eccb3666d20..0c10219501ea1 100644 --- a/neuralmagic/benchmarks/run_benchmark_serving.py +++ b/neuralmagic/benchmarks/run_benchmark_serving.py @@ -8,10 +8,8 @@ from typing import NamedTuple, Optional from pathlib import Path -from .common import ( - download_model, max_model_length_from_model_id, - script_args_to_cla, benchmark_configs -) +from .common import (download_model, max_model_length_from_model_id, + script_args_to_cla, benchmark_configs) from .scripts.common import warmup_server, num_available_gpus from ..tools.call_cmd import call_cmd @@ -59,26 +57,19 @@ def try_connection() -> bool: return False -def run_benchmark_serving_script( - config: NamedTuple, - output_directory: Optional[Path] = None -) -> None: +def run_benchmark_serving_script(config: NamedTuple, + output_directory: Optional[Path] = None + ) -> None: assert config.script_name == 'benchmark_serving' - def run_bench( - server_cmd: str, - bench_cmd: list[str], - model: str - ) -> None: + def run_bench(server_cmd: str, bench_cmd: list[str], model: str) -> None: try: # start server - server_process = subprocess.Popen( - "exec " + server_cmd, shell=True) + server_process = subprocess.Popen("exec " + server_cmd, shell=True) if not is_server_running(BENCH_SERVER_HOST, BENCH_SERVER_PORT): raise ValueError( f"Aborting bench run with : server-cmd {server_cmd} , " - f"bench-cmd {bench_cmd}. Reason: Cannot start Server" - ) + f"bench-cmd {bench_cmd}. Reason: Cannot start Server") # server warmup warmup_server(server_host=BENCH_SERVER_HOST, @@ -106,16 +97,14 @@ def run_bench( supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with the + # If the requested model-len is too big, try running with the # maximum supported for this model. 
max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} " - f"from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: @@ -132,7 +121,7 @@ def run_bench( server_args["sparsity"] = sparsity server_cmd = "python3 -m vllm.entrypoints.api_server " + \ - " ".join([f"--{k} {v}" + " ".join([f"--{k} {v}" for k, v in server_args.items()]) for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/run_benchmark_throughput.py b/neuralmagic/benchmarks/run_benchmark_throughput.py index a28c1a9b73ea4..debb98f8a3279 100644 --- a/neuralmagic/benchmarks/run_benchmark_throughput.py +++ b/neuralmagic/benchmarks/run_benchmark_throughput.py @@ -3,9 +3,8 @@ from pathlib import Path from typing import NamedTuple, Optional -from .common import ( - script_args_to_cla, benchmark_configs, max_model_length_from_model_id -) +from .common import (script_args_to_cla, benchmark_configs, + max_model_length_from_model_id) from ..tools.call_cmd import call_cmd @@ -21,16 +20,14 @@ def run_benchmark_throughput_script(config: NamedTuple, supported_max_model_len = max_model_length_from_model_id(model) - # If the requested model-len is too big, try running with + # If the requested model-len is too big, try running with # the maximum supported for this model. max_model_lens = set( map(lambda v: min(v, supported_max_model_len), config.max_model_lens)) if (config.max_model_lens != list(max_model_lens)): - print( - f"WARNING: max_model_len modified to {max_model_lens} " - f"from {config.max_model_lens} for model {model}" - ) + print(f"WARNING: max_model_len modified to {max_model_lens} " + f"from {config.max_model_lens} for model {model}") for max_model_len in max_model_lens: for script_args in script_args_to_cla(config): diff --git a/neuralmagic/benchmarks/scripts/backend_request_func.py b/neuralmagic/benchmarks/scripts/backend_request_func.py index dc3855f54418e..b5e0308848e25 100644 --- a/neuralmagic/benchmarks/scripts/backend_request_func.py +++ b/neuralmagic/benchmarks/scripts/backend_request_func.py @@ -220,7 +220,7 @@ async def async_request_deepspeed_mii( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len - # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, + # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, # will use 0 as placeholder. # https://github.com/microsoft/DeepSpeed-MII/pull/311 output.ttft = 0 diff --git a/neuralmagic/benchmarks/scripts/benchmark_serving.py b/neuralmagic/benchmarks/scripts/benchmark_serving.py index 5e0ca7d52aa43..6dc32e9d552ea 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_serving.py +++ b/neuralmagic/benchmarks/scripts/benchmark_serving.py @@ -346,8 +346,7 @@ def script_args_as_json_dict(script_args: argparse.Namespace): # Add information about the derived variables as metadata metadata_key = BenchmarkResult.METADATA_KEY_ - result[metadata_key][ - ResultMetadataKeys.num_prompts] = num_prompts + result[metadata_key][ResultMetadataKeys.num_prompts] = num_prompts result[metadata_key][ResultMetadataKeys.request_rate] = \ request_rate if request_rate < float("inf") else "inf" @@ -384,10 +383,8 @@ def from_str(arg: str): "--description", type=str, default="benchmark-serving", - help= - "Benchmark description. 
This is primarily useful when " - "we log the benchmark results and process them for plotting charts" - ) + help="Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument( "--backend", type=str, @@ -436,7 +433,7 @@ def from_str(arg: str): "--tokenizer", type=str, help="Name or path of the tokenizer, " - "if not using the default model tokenizer.", + "if not using the default model tokenizer.", ) parser.add_argument( "--best-of", @@ -480,15 +477,14 @@ def from_str(arg: str): "Otherwise, we use Poisson process to synthesize " "the request arrival times.", ) - parser.add_argument( - "--nr-qps-pair_", - type=NumPrompts_RequestRate_T.from_str, - help=""" + parser.add_argument("--nr-qps-pair_", + type=NumPrompts_RequestRate_T.from_str, + help=""" First argument in the pair is num_prompts to process. Second argument in the pair is request_rate per second. If this is inf, then all the requests are sent at time 0. Otherwise, we use Poisson process to synthesize""", - default=None) + default=None) # Server command args parser.add_argument( @@ -503,27 +499,25 @@ def from_str(arg: str): "--server-args", type=str, default=None, - help= - "When we are logging the output, it is useful to log the " - "arguments passed to the server" - ) + help="When we are logging the output, it is useful to log the " + "arguments passed to the server") def args_sanity_check(args): # Sanity check real-dataset vs synthetic-dataset usecase if args.dataset is None: - assert (args.num_input_tokens is not None and - args.num_output_tokens is not None) + assert (args.num_input_tokens is not None + and args.num_output_tokens is not None) else: - assert (args.num_input_tokens is None and - args.num_output_tokens is None) + assert (args.num_input_tokens is None + and args.num_output_tokens is None) # Sanity check num_prompts, request_rate as separate args vs joint args assert not all([ args.num_prompts_ is None, args.request_rate_ is None, args.nr_qps_pair_ is None ]) if args.nr_qps_pair_ is None: - assert (args.num_prompts_ is not None and - args.request_rate_ is not None) + assert (args.num_prompts_ is not None + and args.request_rate_ is not None) else: assert args.num_prompts_ is None and args.request_rate_ is None # Sanity check required logging args diff --git a/neuralmagic/benchmarks/scripts/benchmark_throughput.py b/neuralmagic/benchmarks/scripts/benchmark_throughput.py index f351a70abdb60..ba586772d5d09 100644 --- a/neuralmagic/benchmarks/scripts/benchmark_throughput.py +++ b/neuralmagic/benchmarks/scripts/benchmark_throughput.py @@ -12,7 +12,7 @@ from pathlib import Path from typing import List, Optional, Tuple from transformers import AutoTokenizer -from .common import (generate_synthetic_requests, warmup_vllm_engine, +from .common import (generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs) from .datasets_registry import get_dataset, DatasetArgs from .logging.benchmark_result import (BenchmarkResult, @@ -164,7 +164,7 @@ def main(args: argparse.Namespace): current_dt_str = current_dt.strftime("%Y%m%d-%H%M%S") file_name = Path( args.save_directory - ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 + ) / f"benchmark_throughput-{args.backend}-{model_id}-{current_dt_str}.json" # noqa: E501 result.store(file_name) @@ -174,10 +174,8 @@ def main(args: argparse.Namespace): "--description", type=str, default="benchmark-throughput", - help= - "Benchmark 
description. This is primarily useful when " - "we log the benchmark results and process them for plotting charts" - ) + help="Benchmark description. This is primarily useful when " + "we log the benchmark results and process them for plotting charts") parser.add_argument("--backend", type=str, choices=["vllm"], diff --git a/neuralmagic/benchmarks/scripts/common.py b/neuralmagic/benchmarks/scripts/common.py index 8fbe292d6abc8..9333939300e92 100644 --- a/neuralmagic/benchmarks/scripts/common.py +++ b/neuralmagic/benchmarks/scripts/common.py @@ -12,8 +12,8 @@ from vllm.outputs import RequestOutput from vllm.transformers_utils.tokenizer import get_tokenizer from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR -from .backend_request_func import ( - RequestFuncInput, RequestFuncOutput, async_request_vllm) +from .backend_request_func import (RequestFuncInput, RequestFuncOutput, + async_request_vllm) from ...tools.call_cmd import call_cmd @@ -188,7 +188,7 @@ async def process_requests(input_requests): def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int, n_output_tokens: int) -> str: - return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 + return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n" # noqa: E501 def print_request_outputs(results: List[RequestOutput]) -> None: diff --git a/neuralmagic/benchmarks/scripts/datasets_registry.py b/neuralmagic/benchmarks/scripts/datasets_registry.py index c1c4d02e725a0..919abb72ee39b 100644 --- a/neuralmagic/benchmarks/scripts/datasets_registry.py +++ b/neuralmagic/benchmarks/scripts/datasets_registry.py @@ -63,8 +63,8 @@ def get_ultrachat(tokenizer: PreTrainedTokenizerBase, prompts = [] completions = [] system_message = { - "content": "You are a chatbot with the explicit goal of " - "helping the user as best as possible", + "content": "You are a chatbot with the explicit goal of " + "helping the user as best as possible", "role": "system", } for messages in ds["messages"]: diff --git a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py index 116eba43f13d2..a89820da7dae9 100644 --- a/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py +++ b/neuralmagic/benchmarks/scripts/logging/gha_benchmark_logging.py @@ -10,9 +10,8 @@ from dataclasses import dataclass from typing import List, Iterable, NamedTuple -from .benchmark_result import ( - GHABenchmarkToolName, BenchmarkResult, MetricTemplate -) +from .benchmark_result import (GHABenchmarkToolName, BenchmarkResult, + MetricTemplate) @dataclass diff --git a/neuralmagic/tools/call_cmd.py b/neuralmagic/tools/call_cmd.py index 2e3f06c267fd3..1168ab5043bfd 100644 --- a/neuralmagic/tools/call_cmd.py +++ b/neuralmagic/tools/call_cmd.py @@ -1,8 +1,8 @@ # -# Run cmd as a sub-process. +# Run cmd as a sub-process. # # Capture stdout, stderr, return status, elapsed time and -# optionally process statistics +# optionally process statistics # (user time, system time, peak memory usage, etc.) 
# import os @@ -15,7 +15,7 @@ def parse_process_stats(str): exp = ( - "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 + "\[Timing\].*: elapsed=([0-9\.]+) user=([0-9\.]+) system=([0-9\.]+) " # noqa: E501 "maxrss=([0-9\.]+) avgrss=([0-9\.]+) avgmem=([0-9\.]+) avgdata=([0-9\.]+)" # noqa: E501 ) results = re.search(exp, str) From 1271e3c735b8bdf952d2c89ca1e6ec61e53ed052 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 00:54:54 +0000 Subject: [PATCH 110/113] yapf after ruff :) --- tests/models/test_marlin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index e524b785af389..34bc6d0e77f61 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -48,6 +48,7 @@ class ModelPair: @pytest.mark.flaky(reruns=2) +@pytest.mark.skip(reason="OOM Again in Automation") @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) From c47bd6b71096e07ecba05936f61908f8212e41f9 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 02:09:18 +0000 Subject: [PATCH 111/113] fixed tests post update --- tests/models/test_compressed.py | 4 ---- tests/models/test_compressed_memory.py | 4 ---- tests/models/test_marlin.py | 8 -------- 3 files changed, 16 deletions(-) diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index c6fce5ef8ae7d..aa885661a0af3 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -41,8 +41,6 @@ def test_models( sparse_outputs = sparse_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Deleting just the model does not always free the GPU memory. - del sparse_model.model.llm_engine.driver_worker del sparse_model gc.collect() @@ -53,8 +51,6 @@ def test_models( dense_outputs = dense_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # Deleting just the model does not always free the GPU memory. - del dense_model.model.llm_engine.driver_worker del dense_model gc.collect() diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py index 056452b77e020..fddebd58104e3 100644 --- a/tests/models/test_compressed_memory.py +++ b/tests/models/test_compressed_memory.py @@ -39,8 +39,6 @@ def test_models( dense_num_kv_blocks = (dense_model.model.llm_engine.scheduler. block_manager.gpu_allocator.num_blocks) - # Deleting just the model does not always free the GPU memory. - del dense_model.model.llm_engine.driver_worker del dense_model torch.cuda.empty_cache() gc.collect() @@ -52,8 +50,6 @@ def test_models( sparse_num_kv_blocks = (sparse_model.model.llm_engine.scheduler. block_manager.gpu_allocator.num_blocks) - # Deleting just the model does not always free the GPU memory. 
- del sparse_model.model.llm_engine.driver_worker del sparse_model torch.cuda.empty_cache() gc.collect() diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index 34bc6d0e77f61..7c0382dfa7b34 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -48,7 +48,6 @@ class ModelPair: @pytest.mark.flaky(reruns=2) -@pytest.mark.skip(reason="OOM Again in Automation") @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("model_pair", model_pairs) @@ -69,11 +68,7 @@ def test_models( marlin_outputs = marlin_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del marlin_model.model.llm_engine.driver_worker del marlin_model - gc.collect() torch.cuda.empty_cache() @@ -84,9 +79,6 @@ def test_models( max_tokens, num_logprobs) - # vllm memory cleanup is poor. This seems to fix things. - # NOTE: upstream sync should use downstream version. - del gptq_model.model.llm_engine.driver_worker del gptq_model gc.collect() torch.cuda.empty_cache() From b9c3578f1dbb72979a3cf238b07509bc5fb8fd42 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Fri, 15 Mar 2024 02:25:01 +0000 Subject: [PATCH 112/113] missed one test --- tests/models/test_models_logprobs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index 80cbf2a48efc4..8878510bd0a93 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -51,7 +51,6 @@ def test_models( max_tokens, num_logprobs) - del vllm_model.model.llm_engine.driver_worker del vllm_model # loop through the prompts From 1e36b51af1b9fda33e1bdf09ce246704571bcef3 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 15 Mar 2024 11:56:26 -0400 Subject: [PATCH 113/113] Update test-pipeline.yaml removed duplicated test in buildkite (bad merge) --- .buildkite/test-pipeline.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fa3487c9f12cd..42a1eacb6de57 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,9 +17,6 @@ steps: - label: Core Test command: pytest -v -s core -- label: Core Test - command: pytest -v -s core - - label: Distributed Comm Ops Test command: pytest -v -s --forked test_comm_ops.py working_dir: "/vllm-workspace/tests/distributed"