diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4cd9b43e59378..31cb4238eb18f 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -214,6 +214,10 @@ def add_cli_args(
             choices=['outlines', 'lm-format-enforcer'],
             help='Which engine will be used for guided decoding'
-            ' (JSON schema / regex etc).')
+            ' (JSON schema / regex etc) by default. Currently support '
+            'https://github.com/outlines-dev/outlines and '
+            'https://github.com/noamgat/lm-format-enforcer.'
+            ' Can be overridden per request via guided_decoding_backend'
+            ' parameter.')
         # Parallel arguments
         parser.add_argument('--worker-use-ray',
                             action='store_true',
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index b32cd3b851d23..781bb5c7ea182 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 from torch.nn import Module
@@ -118,7 +118,7 @@ def apply_weights(self,
         return output
 
 
-def per_tensor_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, float]:
+def per_tensor_quantize(tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
     """Quantize a tensor using per-tensor static scaling factor.
 
     Args:
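The `arg_utils.py` hunk restores the fuller help text, including the note that the server-wide backend can be overridden per request via the `guided_decoding_backend` parameter. As a hedged sketch of that usage (the model name, port, and prompt below are placeholders, not taken from the diff), a client of vLLM's OpenAI-compatible server could override the server default like this:

```python
# Sketch: per-request override of the guided decoding backend.
# Assumes a vLLM OpenAI-compatible server is already running, e.g.:
#   python -m vllm.entrypoints.openai.api_server \
#       --model <model> --guided-decoding-backend outlines
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="<model>",  # placeholder; use the model the server was launched with
    prompt="The capital of France is",
    extra_body={
        # Constrain the output to one of these strings (guided decoding).
        "guided_choice": ["Paris", "London"],
        # Override the server-wide default set by --guided-decoding-backend.
        "guided_decoding_backend": "lm-format-enforcer",
    },
)
print(completion.choices[0].text)
```

The `openai` client merges `extra_body` into the request JSON, which is how vLLM's extra guided decoding fields (`guided_choice`, `guided_json`, `guided_regex`) and the backend override reach the server.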
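The `fp8.py` hunk only swaps the builtin `tuple` for `typing.Tuple` in the `per_tensor_quantize` signature (the builtin is not subscriptable as an annotation before Python 3.9), but the signature and docstring describe the underlying operation. Below is a minimal, self-contained sketch of per-tensor FP8 quantization consistent with that signature; it assumes `torch.float8_e4m3fn` support (PyTorch 2.1+) and is an illustration of the technique, not the file's actual implementation:

```python
from typing import Tuple

import torch


def per_tensor_quantize_sketch(
        tensor: torch.Tensor) -> Tuple[torch.Tensor, float]:
    """Illustrative per-tensor quantization to FP8 (E4M3).

    Not the vLLM implementation: a sketch matching the signature above.
    """
    finfo = torch.finfo(torch.float8_e4m3fn)
    # One scale for the whole tensor: map its absolute maximum onto the
    # FP8 representable maximum. The clamp avoids division by zero.
    amax = tensor.abs().max().clamp(min=1e-12)
    scale = finfo.max / amax
    # Saturate before the downcast, since a plain cast does not clamp
    # out-of-range values.
    qtensor = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
    qtensor = qtensor.to(torch.float8_e4m3fn)
    # Return the inverse scale: multiplying the quantized values by it
    # (after upcasting) approximately recovers the original tensor.
    return qtensor, scale.reciprocal().item()
```

Returning the inverse scale as a plain `float` matches the `Tuple[torch.Tensor, float]` annotation and lets a caller dequantize with a single multiply.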