Merge pull request #31 from slyalin/int8_enabled_by_default
Enabled int8 weights by default
ilya-lavrenov authored Apr 20, 2024
2 parents d848897 + 4931727 commit 469a4d0
Showing 2 changed files with 3 additions and 1 deletion.
2 changes: 2 additions & 0 deletions use_with_openvino.md
@@ -85,6 +85,8 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach

## Use Int-8 Weights Compression

(Note: for debugging purposes, the default value of the variable described below is `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`. Set it to `0` for better accuracy results. Remove this note before creating the vLLM PR.)

Int-8 weight compression is disabled by default. For better performance and lower memory consumption, enable it by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` commands in the examples above.
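As an illustration, the flag slots into the `docker run` pattern shown earlier in this document. The image name and script path below are placeholders, not taken from the source:

```shell
# Hypothetical invocation: image name (vllm-openvino) and script path are
# placeholders; the -e flag is the part this section describes.
docker run --rm -it \
    -v $HOME/.cache/huggingface:/root/.cache/huggingface \
    -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1 \
    --entrypoint python3 \
    vllm-openvino \
    examples/offline_inference.py
```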

2 changes: 1 addition & 1 deletion vllm/model_executor/openvino_model_loader.py
@@ -601,7 +601,7 @@ def get_model(model_config: ModelConfig,
     else:
         print(f'[ INFO ] OpenVINO IR is available for provided model id {model_config.model}. '
               'This IR will be used for inference as-is, all possible options that may affect model conversion are ignored.')
-    load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False
+    load_in_8bit = False if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '1') == '0' else None
     pt_model = OVModelForCausalLM.from_pretrained(
         model_config.model,
         export=export,
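The changed line flips the default: previously `load_in_8bit` was `False` unless the variable was explicitly set to `1`; now it is `None` (deferring to the loader's default, which per this commit's title enables int8 weights) unless the variable is explicitly set to `0`. A minimal sketch of the new semantics, using a hypothetical helper name `resolve_load_in_8bit` not present in the source:

```python
def resolve_load_in_8bit(env: dict) -> object:
    """Mirror the updated default logic.

    Returns None (let the model loader apply its default, i.e. int8 weights
    enabled per this commit) unless the variable is explicitly '0',
    in which case compression is disabled with False.
    """
    return False if env.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '1') == '0' else None

# Unset or '1' -> None (int8 path enabled by default); '0' -> False.
print(resolve_load_in_8bit({}))
print(resolve_load_in_8bit({'VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS': '0'}))
```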
