diff --git a/use_with_openvino.md b/use_with_openvino.md index 7795b4fdceec3..452c3cf5a5f46 100644 --- a/use_with_openvino.md +++ b/use_with_openvino.md @@ -85,6 +85,8 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach ## Use Int-8 Weights Compression +(Note: for debugging purposes, the variable described below currently defaults to `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`. Set it to `0` for better accuracy. Remove this note before creating the vLLM PR.) + Weights int-8 compression is disabled by default. For better performance and lesser memory consumption, the weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`. To pass the variable in docker, use `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to `docker run` command in the examples above. diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py index ccbf6090f3e3d..b13ef38a377a9 100644 --- a/vllm/model_executor/openvino_model_loader.py +++ b/vllm/model_executor/openvino_model_loader.py @@ -601,7 +601,7 @@ def get_model(model_config: ModelConfig, else: print(f'[ INFO ] OpenVINO IR is avaialble for provided model id {model_config.model}. ' 'This IR will be used for inference as-is, all possible options that may affect model conversion are ignored.') - load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False + load_in_8bit = False if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '1') == '0' else None pt_model = OVModelForCausalLM.from_pretrained( model_config.model, export=export,