Merge branch 'master' into as/npuw_new_compute_patterns
smirnov-alexey authored Nov 22, 2024
2 parents 65686c2 + f6e0ba0 commit bf1c5a9
Showing 99 changed files with 1,859 additions and 2,351 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -138,7 +138,7 @@ function(ov_developer_package_export_targets)
endforeach()
endif()
else()
message(FATAL_ERROR "Internal error: ${target_name} does not represent a cmake target")
message(FATAL_ERROR "Internal error: '${EXPORT_TARGET}' does not represent a cmake target")
endif()

list(REMOVE_DUPLICATES _OPENVINO_DEVELOPER_PACKAGE_TARGETS)
19 changes: 14 additions & 5 deletions cmake/developer_package/compile_flags/sdl.cmake
@@ -55,11 +55,20 @@ if(ENABLE_INTEGRITYCHECK)
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /INTEGRITYCHECK")
endif()

set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${OV_C_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${OV_C_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_MODULE_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR (OV_COMPILER_IS_INTEL_LLVM AND WIN32))
# add sdl required flags to both Debug and Release on Windows
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OV_C_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OV_C_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OV_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${OV_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OV_LINKER_FLAGS}")
else()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} ${OV_C_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${OV_C_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS_RELEASE "${CMAKE_MODULE_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} ${OV_LINKER_FLAGS}")
endif()

unset(OV_C_CXX_FLAGS)
unset(OV_LINKER_FLAGS)
3 changes: 3 additions & 0 deletions cmake/features.cmake
@@ -200,6 +200,9 @@ ov_dependent_option (ENABLE_SYSTEM_PROTOBUF "Enables use of system Protobuf" OFF
# the option is turned off by default, because we don't want to have a dependency on libsnappy.so
ov_dependent_option (ENABLE_SYSTEM_SNAPPY "Enables use of system version of Snappy" OFF
"ENABLE_SNAPPY_COMPRESSION" OFF)
# the option is turned off by default, because we are not sure that system version of ZE loader is fresh enough
ov_dependent_option (ENABLE_SYSTEM_LEVEL_ZERO "Enables use of system version of Level Zero" OFF
"ENABLE_INTEL_NPU" OFF)

ov_dependent_option(ENABLE_JS "Enables JS API building" ${ENABLE_JS_DEFAULT} "NOT ANDROID;NOT EMSCRIPTEN" OFF)

@@ -76,14 +76,14 @@ Feature Support and API Coverage
| HETERO | 61.22 % | 99.24 % | 86.05 % |
+-------------------------+-----------+------------------+-------------------+
| || Percentage of API supported by the device, |
| || as of OpenVINO 2024.4, 25 Oct, 2024. |
| || as of OpenVINO 2024.5, 20 Nov. 2024. |
+-------------------------+-----------+------------------+-------------------+

For setting up a relevant configuration, refer to the
:doc:`Integrate with Customer Application <../../openvino-workflow/running-inference/integrate-openvino-with-your-application>`
topic (step 3 "Configure input and output").

.. dropdown:: Device support across OpenVINO 2024.4 distributions
.. dropdown:: Device support across OpenVINO 2024.5 distributions

=============== ========== ====== =============== ======== ============ ========== ========== ==========
Device Archives PyPI APT/YUM/ZYPPER Conda Homebrew vcpkg Conan npm
6 changes: 3 additions & 3 deletions docs/articles_en/about-openvino/performance-benchmarks.rst
@@ -13,7 +13,7 @@ Performance Benchmarks
Efficient LLMs for AI PC <performance-benchmarks/generative-ai-performance>
Performance Information F.A.Q. <performance-benchmarks/performance-benchmarks-faq>
OpenVINO Accuracy <performance-benchmarks/model-accuracy-int8-fp32>
Getting Performance Numbers <performance-benchmarks/getting-performance-numbers>
Getting Performance Numbers <performance-benchmarks/getting-performance-numbers>


This page presents benchmark results for the
@@ -160,10 +160,10 @@ For a listing of all platforms and configurations used for testing, refer to the
**Disclaimers**

* Intel® Distribution of OpenVINO™ toolkit performance results are based on release
2024.3, as of July 31, 2024.
2024.5, as of November 20, 2024.

* OpenVINO Model Server performance results are based on release
2024.3, as of Aug. 19, 2024.
2024.4, as of Sept. 30, 2024.

The results may not reflect all publicly available updates. Intel technologies' features and
benefits depend on system configuration and may require enabled hardware, software, or service
74 changes: 37 additions & 37 deletions docs/articles_en/about-openvino/release-notes-openvino.rst
@@ -28,21 +28,22 @@ OpenVINO Release Notes
What's new
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

* More Gen AI coverage and framework integrations to minimize code changes.
* More GenAI coverage and framework integrations to minimize code changes.

* New models supported: Llama 3.2 (1B & 3B), Gemma 2 (2B & 9B), and YOLO11.
* LLM support on NPU: Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, Qwen2-7B-Instruct and Phi-3 Mini.
* LLM support on NPU: Llama 3 8B, Llama 2 7B, Mistral-v0.2-7B, Qwen2-7B-Instruct and Phi-3
Mini-Instruct.
* Noteworthy notebooks added: Sam2, Llama3.2, Llama3.2 - Vision, Wav2Lip, Whisper, and Llava.
Preview: Support for Flax, a high-performance Python neural network library based on JAX.
* Preview: support for Flax, a high-performance Python neural network library based on JAX.
Its modular design allows for easy customization and accelerated inference on GPUs.

* Broader Large Language Model (LLM) support and more model compression techniques.

* Optimizations for built-in GPUs on Intel® Core Ultra Processors (Series 1) and Intel® Arc™
* Optimizations for built-in GPUs on Intel® Core Ultra Processors (Series 1) and Intel® Arc™
Graphics include KV Cache compression for memory reduction along with improved usability,
and model load time optimizations to improve first token latency for LLMs.
* Dynamic quantization was enabled to improve first token latency for LLMs on built-in
Intel® GPUs without impacting accuracy on Intel Core Ultra Processors (Series 1). Second
Intel® GPUs without impacting accuracy on Intel® Core Ultra Processors (Series 1). Second
token latency will also improve for large batch inference.
* A new method to generate synthetic text data is implemented in the Neural Network
Compression Framework (NNCF). This will allow LLMs to be compressed more accurately using
@@ -52,9 +53,9 @@ What's new
* More portability and performance to run AI at the edge, in the cloud, or locally.

* Support for
`Intel® Xeon 6 Processors with P-cores <https://ark.intel.com/content/www/us/en/ark/products/codename/128428/products-formerly-granite-rapids.html>`__
`Intel® Xeon® 6 Processors with P-cores <https://ark.intel.com/content/www/us/en/ark/products/codename/128428/products-formerly-granite-rapids.html>`__
(formerly codenamed Granite Rapids) and
`Intel® Core Ultra 200V series processors <https://ark.intel.com/content/www/us/en/ark/products/codename/225837/products-formerly-arrow-lake.html>`__
`Intel® Core Ultra 200V series processors <https://ark.intel.com/content/www/us/en/ark/products/codename/225837/products-formerly-arrow-lake.html>`__
(formerly codenamed Arrow Lake-S).
* Preview: GenAI API enables multimodal AI deployment with support for multimodal pipelines
for improved contextual awareness, transcription pipelines for easy audio-to-text
@@ -95,9 +96,9 @@ Common
CPU Device Plugin
-----------------------------------------------------------------------------------------------

* Gold support of the Intel Xeon 6 platform with P-cores (formerly code name Granite Rapids)
* Gold support of the Intel® Xeon® 6 platform with P-cores (formerly code name Granite Rapids)
has been reached.
* Support of Intel® Core Ultra 200V series processors (formerly codenamed Arrow Lake-S) has
* Support of Intel® Core Ultra 200V series processors (formerly codenamed Arrow Lake-S) has
been implemented.
* LLM performance has been further improved with Rotary Position Embedding optimization; Query,
Key, and Value; and multi-layer perceptron fusion optimization.
@@ -117,7 +118,7 @@ GPU Device Plugin
* A new feature of GPU weightless blob caching enables caching model structure only and reusing
the weights from the original model file. Use the new OPTIMIZE_SIZE property to activate.
* Dynamic quantization with INT4 and INT8 precisions has been implemented and enabled by
default on Intel Core Ultra platforms, improving LLM first token latency.
default on Intel® Core Ultra platforms, improving LLM first token latency.


NPU Device Plugin
@@ -138,14 +139,11 @@ NPU Device Plugin
* LLM-related improvements have been implemented in terms of both memory usage and performance.
* AvgPool and MaxPool operator support has been extended, adding support for more PyTorch models.

* NOTE: for systems based on Intel® Core Ultra Processors Series 2, more than 16GB of RAM may
* NOTE: for systems based on Intel® Core Ultra Processors Series 2, more than 16GB of RAM may
be required to use larger models, such as Llama-2-7B, Mistral-0.2-7B, and Qwen-2-7B
(exceeding 4b parameters).
(exceeding 4B parameters) with prompt sizes over 1024 tokens.


prompts longer then 1024 characters will not work with a model of 7B or more parameters,
such as .

OpenVINO Python API
-----------------------------------------------------------------------------------------------

@@ -328,17 +326,17 @@ Known Issues
| Description:
| Description: When using new version of Transformer version to convert some of LLMs
(GPT-J/GPT-NeoX or falcon-7b), the inference accuracy may be impacted on 4th or 5th
generation of Xeon CPU platforms, due to model structure update triggering inference
generation of Intel® Xeon® processors, due to model structure update triggering inference
precision difference in part of the model. The workaround is to use transformer version of
4.44.2 or lower.
| **Component: GPU Plugin**
| ID: 154583
| Description:
| LLM accuracy can be low especially on non-systolic platform like Intel Core Ultra. When
| LLM accuracy can be low especially on non-systolic platforms like Intel® Core Ultra. When
facing the low accuracy issue, user needs to manually set a config ACTIVATION_SCALING_FACOTR
with a value 8.0 in compile_model() function. From the next release, scaling factor value
will be automatically applied through updated IR.
with a value of 8.0 in the compile_model() function. From the next release, scaling factor
value will be automatically applied through updated IR.
| **Component: GenAI**
| ID: 156437, 148933
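
As a rough illustration of the GPU Plugin workaround above (ID 154583), the scaling factor is passed as a config entry to compile_model(). This is only a hypothetical sketch: the property name is copied verbatim from the note (its exact spelling in the plugin may differ) and the model path is a placeholder.

# Hypothetical sketch of the workaround for the GPU Plugin accuracy issue above.
# The property name is taken verbatim from the release note; "model.xml" is a placeholder.
import openvino as ov

core = ov.Core()
compiled_model = core.compile_model("model.xml", "GPU",
                                    {"ACTIVATION_SCALING_FACOTR": 8.0})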
@@ -402,7 +400,7 @@ Previous 2024 releases

* More portability and performance to run AI at the edge, in the cloud, or locally.

* Support for Intel® Core Ultra Processors Series 2 (formerly codenamed Lunar Lake) on Windows.
* Support for Intel® Core Ultra Processors Series 2 (formerly codenamed Lunar Lake) on Windows.
* OpenVINO™ Model Server now comes with production-quality support for OpenAI-compatible API
which enables significantly higher throughput for parallel inferencing on Intel® Xeon®
processors when serving LLMs to many concurrent users.
@@ -450,10 +448,10 @@ Previous 2024 releases
* 1st token performance with Llama series of models, with additional CPU operator optimization
(such as MLP, SDPA) on BF16 precision.
* Default oneTBB version on Linux is now 2021.13.0, improving overall performance on latest
Intel XEON platforms.
Intel® Xeon® platforms.
* MXFP4 weight compression models (compressing weights to 4-bit with the e2m1 data type
without a zero point and with 8-bit e8m0 scales) have been optimized for Xeon platforms
thanks to fullyconnected compressed weight LLM support.
without a zero point and with 8-bit e8m0 scales) have been optimized for Intel® Xeon®
platforms thanks to fullyconnected compressed weight LLM support.

* The following has been fixed:

@@ -622,7 +620,7 @@ Previous 2024 releases
| ID: CVS-150542, CVS-145996
| Description:
| The upgrade of default oneTBB on Linux platforms to 2021.13.0 improves overall
performance on latest Intel XEON platform but causes regression in some cases. Limit the
performance on latest Intel® Xeon® platform but causes regression in some cases. Limit the
threads usage of postprocessing done by Torch can mitigate the regression (For example:
torch.set_num_threads(n), n can be 1, beam search number, prompt batch size or other
numbers).
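
A minimal sketch of the mitigation described above, assuming the post-processing runs in PyTorch; the thread count is illustrative.

import torch

# Cap the threads Torch uses for post-processing so it does not contend with
# oneTBB-based OpenVINO inference; 1, the beam-search width, or the prompt
# batch size are the values suggested in the note above.
torch.set_num_threads(1)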
@@ -949,7 +947,7 @@ Previous 2024 releases
* Preview: addition of the :doc:`Generate API <../learn-openvino/llm_inference_guide/genai-guide>`,
a simplified API for text generation using large language models with only a few lines of
code. The API is available through the newly launched OpenVINO GenAI package.
* Support for Intel Atom® Processor X Series. For more details, see :doc:`System Requirements <./release-notes-openvino/system-requirements>`.
* Support for Intel® Atom® Processor X Series. For more details, see :doc:`System Requirements <./release-notes-openvino/system-requirements>`.
* Preview: Support for Intel® Xeon® 6 processor.
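
A minimal sketch of the Generate API preview mentioned in the list above, assuming a model already exported to OpenVINO IR in ./model_dir and the openvino-genai package installed; the path, device, and prompt are illustrative.

import openvino_genai

# Text generation in a few lines via the OpenVINO GenAI package.
pipe = openvino_genai.LLMPipeline("./model_dir", "CPU")
print(pipe.generate("What is OpenVINO?", max_new_tokens=100))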

**OpenVINO™ Runtime**
@@ -973,8 +971,8 @@ Previous 2024 releases
*CPU Device Plugin*

* Performance when using latency mode in FP32 precision has been improved on Intel client
platforms, including Core Ultra (formerly codenamed Meteor Lake) and 13th Gen Core processors
(formerly codenamed Raptor Lake).
platforms, including Intel® Core Ultra (formerly codenamed Meteor Lake) and 13th Gen Core
processors (formerly codenamed Raptor Lake).
* 2nd token latency and memory footprint for FP16 LLMs have been improved significantly on AVX2
and AVX512 based CPU platforms, particularly for small batch sizes.
* PagedAttention has been optimized on AVX2, AVX512 and AMX platforms together with INT8 KV cache
@@ -988,9 +986,9 @@ Previous 2024 releases

* Both first token and average token latency of LLMs is improved on all GPU platforms, most
significantly on discrete GPUs. Memory usage of LLMs has been reduced as well.
* Stable Diffusion FP16 performance improved on Core Ultra platforms, with significant pipeline
improvement for models with dynamic-shaped input. Memory usage of the pipeline has been reduced,
as well.
* Stable Diffusion FP16 performance improved on Intel® Core Ultra platforms, with significant
pipeline improvement for models with dynamic-shaped input. Memory usage of the pipeline
has been reduced, as well.
* Optimized permute_f_y kernel performance has been improved.

*NPU Device Plugin*
@@ -1045,7 +1043,7 @@ Previous 2024 releases

* OpenVINO Model server can be now used for text generation use cases using OpenAI compatible API.
* Added support for continuous batching and PagedAttention algorithms for text generation with
fast and efficient in high concurrency load especially on Intel Xeon processors.
fast and efficient in high concurrency load especially on Intel® Xeon® processors.
`Learn more about it <https://github.com/openvinotoolkit/model_server/tree/releases/2024/2/demos/continuous_batching>`__.

**Neural Network Compression Framework**
@@ -1088,8 +1086,8 @@ Previous 2024 releases
| Description:
| In 2024.2, oneTBB 2021.2.x is used for Intel Distribution of OpenVINO Ubuntu and Red Hat
archives, instead of system TBB/oneTBB. This improves performance on the new generation of
Xeon platforms but may increase latency of some models on the previous generation. You can
build OpenVINO with **-DSYSTEM_TBB=ON** to get better latency performance for these models.
Intel® Xeon® platforms but may increase latency of some models on the previous generation.
You can build OpenVINO with **-DSYSTEM_TBB=ON** to get better latency performance for
these models.
| **Component: python API**
| ID: CVS-141744
@@ -1598,10 +1597,10 @@ Previous 2024 releases
| **Component: CPU runtime**
| *ID:* N/A
| *Description:*
| Performance results (first token latency) may vary from those offered by the previous OpenVINO version, for
“latency” hint inference of LLMs with long prompts on Xeon platforms with 2 or more
sockets. The reason is that all CPU cores of just the single socket running the application
are employed, lowering the memory overhead for LLMs when numa control is not used.
| Performance results (first token latency) may vary from those offered by the previous
OpenVINO version, for “latency” hint inference of LLMs with long prompts on Intel® Xeon®
platforms with 2 or more sockets. The reason is that all CPU cores of just the single
socket running the application are employed, lowering the memory overhead for LLMs when
numa control is not used.
| *Workaround:*
| The behavior is expected but stream and thread configuration may be used to include cores
from all sockets.
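
A hedged sketch of the stream/thread workaround above. The property names follow the standard openvino.properties Python API; the thread count is illustrative for a two-socket system and "model.xml" is a placeholder.

import openvino as ov
import openvino.properties as props
import openvino.properties.hint as hints

core = ov.Core()
compiled_model = core.compile_model(
    "model.xml", "CPU",
    {hints.performance_mode: hints.PerformanceMode.LATENCY,
     props.num_streams: 1,
     props.inference_num_threads: 64},  # e.g. all cores across both sockets
)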
@@ -4,8 +4,8 @@ OpenVINO™ GenAI Dependencies
OpenVINO™ GenAI depends on both `OpenVINO <https://github.com/openvinotoolkit/openvino>`__ and
`OpenVINO Tokenizers <https://github.com/openvinotoolkit/openvino_tokenizers>`__. During OpenVINO™
GenAI installation from PyPi, the same versions of OpenVINO and OpenVINO Tokenizers
are used (e.g. ``openvino==2024.4.0`` and ``openvino-tokenizers==2024.4.0.0`` are installed for
``openvino-genai==2024.4.0``).
are used (e.g. ``openvino==2024.5.0`` and ``openvino-tokenizers==2024.5.0.0`` are installed for
``openvino-genai==2024.5.0``).
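
A small, hypothetical sketch of how to confirm that the three packages stay on the matching releases named above; importlib.metadata is just one way to read the installed versions.

from importlib.metadata import version

for pkg in ("openvino", "openvino-tokenizers", "openvino-genai"):
    print(pkg, version(pkg))  # e.g. 2024.5.0 / 2024.5.0.0 / 2024.5.0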

Trying to update any of the dependency packages might result in a version incompatibility
due to different Application Binary Interfaces (ABIs), which will result in errors while running
6 changes: 3 additions & 3 deletions docs/articles_en/get-started/install-openvino.rst
@@ -1,4 +1,4 @@
Install OpenVINO™ 2024.4
Install OpenVINO™ 2024.5
==========================


@@ -19,9 +19,9 @@ Install OpenVINO™ 2024.4

.. raw:: html

<script type="module" crossorigin src="../_static/selector-tool/assets/index-f34d1fad.js"></script>
<script type="module" crossorigin src="../_static/selector-tool/assets/index-Codcw3jz.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<iframe id="selector" src="../_static/selector-tool/selector-8d4cf1d.html" style="width: 100%; border: none" title="Download Intel® Distribution of OpenVINO™ Toolkit"></iframe>
<iframe id="selector" src="../_static/selector-tool/selector-451bede.html" style="width: 100%; border: none" title="Download Intel® Distribution of OpenVINO™ Toolkit"></iframe>

OpenVINO 2024.5, described here, is not a Long-Term-Support version!
All currently supported versions are: