From 49317272c729dddf9d3758c43168d3e0a499cbc5 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Fri, 19 Apr 2024 12:02:03 +0000
Subject: [PATCH] Enabled int8 weights by default for performance benchmarking purposes

---
 use_with_openvino.md                         | 2 ++
 vllm/model_executor/openvino_model_loader.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/use_with_openvino.md b/use_with_openvino.md
index 7795b4fdceec3..452c3cf5a5f46 100644
--- a/use_with_openvino.md
+++ b/use_with_openvino.md
@@ -85,6 +85,8 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach
 
 ## Use Int-8 Weights Compression
 
+(Note: for debugging purposes, the default value of the variable described below is `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`. Set it to `0` for better accuracy results. Remove this note before creating the vLLM PR.)
+
 Weights int-8 compression is disabled by default. For better performance and lower memory consumption, the weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
 To pass the variable in docker, use `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` command in the examples above.
 
diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index ccbf6090f3e3d..b13ef38a377a9 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -601,7 +601,7 @@ def get_model(model_config: ModelConfig,
     else:
         print(f'[ INFO ] OpenVINO IR is available for provided model id {model_config.model}. '
               'This IR will be used for inference as-is; all possible options that may affect model conversion are ignored.')
-    load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False
+    load_in_8bit = False if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '1') == '0' else None
    pt_model = OVModelForCausalLM.from_pretrained(
         model_config.model,
         export=export,
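
For reviewers, here is a minimal, self-contained sketch of what the one-line loader change does. This is not part of the patch, and the helper name `resolve_load_in_8bit` is hypothetical; it only mirrors the flipped default visible in the hunk above.

```python
import os
from typing import Optional


def resolve_load_in_8bit() -> Optional[bool]:
    """Sketch of the patched default logic in get_model().

    After this patch the environment variable defaults to '1', so int8
    weight compression is opted into unless the user explicitly sets it
    to '0'.
    """
    if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '1') == '0':
        # Explicit opt-out: force full-precision weights.
        return False
    # None defers to the optimum-intel default for OVModelForCausalLM,
    # which may apply 8-bit weight compression on its own.
    return None


# Before the patch the condition was inverted: the variable defaulted to
# '0', only an explicit '1' yielded None, and any other value forced False.
```

The resulting value is what the loader passes as `load_in_8bit=...` to `OVModelForCausalLM.from_pretrained`, which is why the default flip enables int8 weights without any other code change.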