From 67ad3b5071be75c33accce6c25ae94002c1f26ca Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Mon, 13 Jan 2025 09:03:53 +0100 Subject: [PATCH] [Samples] merge LLM samples to "text_generation" folder (#1411) --- .github/workflows/causal_lm_cpp.yml | 32 ++-- .github/workflows/linux.yml | 2 +- .github/workflows/mac.yml | 2 +- .github/workflows/windows.yml | 2 +- samples/CMakeLists.txt | 19 --- .../cpp/beam_search_causal_lm/CMakeLists.txt | 22 --- samples/cpp/beam_search_causal_lm/README.md | 38 ----- samples/cpp/benchmark_genai/CMakeLists.txt | 32 ---- samples/cpp/benchmark_genai/README.md | 49 ------ samples/cpp/chat_sample/CMakeLists.txt | 22 --- samples/cpp/chat_sample/README.md | 46 ------ .../cpp/lora_greedy_causal_lm/CMakeLists.txt | 19 --- .../cpp/multinomial_causal_lm/CMakeLists.txt | 22 --- samples/cpp/multinomial_causal_lm/README.md | 38 ----- .../prompt_lookup_decoding_lm/CMakeLists.txt | 23 --- .../cpp/prompt_lookup_decoding_lm/README.md | 41 ----- .../speculative_decoding_lm/CMakeLists.txt | 23 --- samples/cpp/speculative_decoding_lm/README.md | 45 ------ samples/cpp/text_generation/CMakeLists.txt | 62 +++++--- samples/cpp/text_generation/README.md | 139 +++++++++++++++-- .../beam_search_causal_lm.cpp | 0 .../benchmark_genai.cpp | 0 .../chat_sample.cpp | 0 .../lora_greedy_causal_lm.cpp | 0 .../multinomial_causal_lm.cpp | 0 .../prompt_lookup_decoding_lm.cpp | 0 .../speculative_decoding_lm.cpp | 0 .../python/beam_search_causal_lm/README.md | 38 ----- samples/python/benchmark_genai/README.md | 50 ------ samples/python/chat_sample/README.md | 46 ------ .../python/multinomial_causal_lm/README.md | 48 ------ .../prompt_lookup_decoding_lm/README.md | 41 ----- .../python/speculative_decoding_lm/README.md | 50 ------ samples/python/text_generation/README.md | 143 ++++++++++++++---- .../beam_search_causal_lm.py | 0 .../benchmark_genai.py | 0 .../chat_sample.py | 0 .../multinomial_causal_lm.py | 0 .../prompt_lookup_decoding_lm.py | 0 .../speculative_decoding_lm.py | 0 src/README.md | 4 +- 41 files changed, 303 insertions(+), 795 deletions(-) delete mode 100644 samples/cpp/beam_search_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/beam_search_causal_lm/README.md delete mode 100644 samples/cpp/benchmark_genai/CMakeLists.txt delete mode 100644 samples/cpp/benchmark_genai/README.md delete mode 100644 samples/cpp/chat_sample/CMakeLists.txt delete mode 100644 samples/cpp/chat_sample/README.md delete mode 100644 samples/cpp/lora_greedy_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/multinomial_causal_lm/CMakeLists.txt delete mode 100644 samples/cpp/multinomial_causal_lm/README.md delete mode 100644 samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt delete mode 100644 samples/cpp/prompt_lookup_decoding_lm/README.md delete mode 100644 samples/cpp/speculative_decoding_lm/CMakeLists.txt delete mode 100644 samples/cpp/speculative_decoding_lm/README.md rename samples/cpp/{beam_search_causal_lm => text_generation}/beam_search_causal_lm.cpp (100%) rename samples/cpp/{benchmark_genai => text_generation}/benchmark_genai.cpp (100%) rename samples/cpp/{chat_sample => text_generation}/chat_sample.cpp (100%) rename samples/cpp/{lora_greedy_causal_lm => text_generation}/lora_greedy_causal_lm.cpp (100%) rename samples/cpp/{multinomial_causal_lm => text_generation}/multinomial_causal_lm.cpp (100%) rename samples/cpp/{prompt_lookup_decoding_lm => text_generation}/prompt_lookup_decoding_lm.cpp (100%) rename samples/cpp/{speculative_decoding_lm => text_generation}/speculative_decoding_lm.cpp 
(100%) delete mode 100644 samples/python/beam_search_causal_lm/README.md delete mode 100644 samples/python/benchmark_genai/README.md delete mode 100644 samples/python/chat_sample/README.md delete mode 100644 samples/python/multinomial_causal_lm/README.md delete mode 100644 samples/python/prompt_lookup_decoding_lm/README.md delete mode 100644 samples/python/speculative_decoding_lm/README.md rename samples/python/{beam_search_causal_lm => text_generation}/beam_search_causal_lm.py (100%) rename samples/python/{benchmark_genai => text_generation}/benchmark_genai.py (100%) rename samples/python/{chat_sample => text_generation}/chat_sample.py (100%) rename samples/python/{multinomial_causal_lm => text_generation}/multinomial_causal_lm.py (100%) rename samples/python/{prompt_lookup_decoding_lm => text_generation}/prompt_lookup_decoding_lm.py (100%) rename samples/python/{speculative_decoding_lm => text_generation}/speculative_decoding_lm.py (100%) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index e53ffbd643..5fc4617f2c 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -53,12 +53,12 @@ jobs: wget https://huggingface.co/smangrul/tinyllama_lora_sql/resolve/main/adapter_model.safetensors?download=true -O adapter_model.safetensors - run: > . ./ov/setupvars.sh - && timeout 35s ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a + && timeout 35s ./build/samples/cpp/text_generation/multinomial_causal_lm ./open_llama_3b_v2/ a env: PYTHONPATH: "./build" - run: > . ./ov/setupvars.sh - && timeout 35s ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b + && timeout 35s ./samples/python/text_generation/multinomial_causal_lm.py ./open_llama_3b_v2/ b env: PYTHONPATH: "./build" - run: > @@ -78,8 +78,8 @@ jobs: matrix: executable: [ - ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm, - python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, + ./build/samples/cpp/text_generation/beam_search_causal_lm, + python ./samples/python/text_generation/beam_search_causal_lm.py, ] runs-on: ubuntu-20.04 defaults: @@ -338,8 +338,8 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - run: > . ./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - env: PYTHONPATH: "./build" @@ -373,8 +373,8 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - run: > . ./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./phi-2/ 69 + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./phi-2/ 69) - env: PYTHONPATH: "./build" @@ -408,8 +408,8 @@ jobs: optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - run: > . 
./ov/setupvars.sh - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + && timeout 50s ./build/samples/cpp/text_generation/beam_search_causal_lm ./notus-7b-v1/ 69 + | diff <(timeout 50s ./samples/python/text_generation/beam_search_causal_lm.py ./notus-7b-v1/ 69) - env: PYTHONPATH: "./build" @@ -445,9 +445,9 @@ jobs: - name: run and compare run: | source ./ov/setupvars.sh - ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt + ./build/samples/cpp/text_generation/speculative_decoding_lm ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_speculative.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt - python ./samples/python/speculative_decoding_lm/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt + python ./samples/python/text_generation/speculative_decoding_lm.py ./dolly-v2-7b/ ./dolly-v2-3b/ "Alan Turing was a" > predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -502,9 +502,9 @@ jobs: Question: Can you please add 2 and 3 A:' > ./prompt.txt - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt + ./build/samples/cpp/text_generation/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt ./build/samples/cpp/text_generation/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt - python ./samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt + python ./samples/python/text_generation/prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_py.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -664,7 +664,7 @@ jobs: run: | source ./ov/setupvars.sh printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt - timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt python -c " from transformers import AutoTokenizer, AutoModelForCausalLM model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' @@ -693,7 +693,7 @@ jobs: " diff pred.txt ref.txt echo "Chat sample cpp" passed - timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt diff pred2.txt ref.txt echo "Chat sample python" passed diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0a991e2a54..5fc5568853 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -374,7 +374,7 @@ jobs: - name: Test multinomial_causal_lm.py if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only timeout-minutes: 1 - run: ${{ env.INSTALL_DIR }}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + run: ${{ env.INSTALL_DIR 
}}/samples/python/text_generation/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 working-directory: ${{ env.MODELS_DIR }} - name: Test whisper_speech_recognition.py diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 7cb0ff98d3..f377d3e6a5 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -395,7 +395,7 @@ jobs: if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only run: | source ${OV_INSTALL_DIR}/setupvars.sh - ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + ${OV_INSTALL_DIR}/samples/python/text_generation/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 timeout-minutes: 1 - name: Test python samples (whisper_speech_recognition) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e65972110b..ea07316942 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -470,7 +470,7 @@ jobs: if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" - python ${{ env.OV_INSTALL_DIR }}\samples\python\multinomial_causal_lm\multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 0 + python ${{ env.OV_INSTALL_DIR }}\samples\python\text_generation\multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 0 - name: Test python samples (whisper_speech_recognition) if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 02539df6e7..d32eb832a6 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -2,14 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # -add_subdirectory(cpp/beam_search_causal_lm) -add_subdirectory(cpp/benchmark_genai) -add_subdirectory(cpp/chat_sample) add_subdirectory(cpp/text_generation) -add_subdirectory(cpp/lora_greedy_causal_lm) -add_subdirectory(cpp/multinomial_causal_lm) -add_subdirectory(cpp/prompt_lookup_decoding_lm) -add_subdirectory(cpp/speculative_decoding_lm) add_subdirectory(cpp/image_generation) add_subdirectory(cpp/visual_language_chat) add_subdirectory(cpp/whisper_speech_recognition) @@ -22,27 +15,15 @@ install(FILES COMPONENT cpp_samples_genai) install(DIRECTORY - cpp/beam_search_causal_lm - cpp/benchmark_genai - cpp/chat_sample cpp/text_generation cpp/image_generation - cpp/lora_greedy_causal_lm - cpp/multinomial_causal_lm - # Don't install prompt_lookup_decoding_lm because it doesn't use openvino_genai library and is not verified yet. - cpp/speculative_decoding_lm cpp/visual_language_chat cpp/whisper_speech_recognition DESTINATION samples/cpp COMPONENT cpp_samples_genai) install(DIRECTORY - python/beam_search_causal_lm - python/benchmark_genai - python/chat_sample python/text_generation python/image_generation - python/multinomial_causal_lm - python/speculative_decoding_lm python/visual_language_chat python/whisper_speech_recognition DESTINATION samples/python COMPONENT cpp_samples_genai diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt deleted file mode 100644 index 9bf1a8aac8..0000000000 --- a/samples/cpp/beam_search_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - HINTS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
- NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai) -set_target_properties(beam_search_causal_lm PROPERTIES - COMPILE_PDB_NAME beam_search_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(beam_search_causal_lm PRIVATE cxx_std_11) - -install(TARGETS beam_search_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md deleted file mode 100644 index 947160e092..0000000000 --- a/samples/cpp/beam_search_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation C++ sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample fearures `ov::genai::LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
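For orientation, the beam-search sample described in the README above configures `ov::genai::LLMPipeline` for multiple beam groups. A minimal C++ sketch of that kind of configuration, assuming the documented `GenerationConfig` fields (`num_beams`, `num_beam_groups`, `diversity_penalty`); the model folder and prompt are placeholders, and the moved sample source remains the authoritative version:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model folder produced by `optimum-cli export openvino`.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    config.num_beam_groups = 3;       // beams are split into diverse groups
    config.num_beams = 15;            // total beams, divisible by the group count
    config.diversity_penalty = 1.0f;  // discourages groups from repeating each other

    std::cout << pipe.generate("Why is the Sun yellow?", config) << '\n';
    return 0;
}
```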
diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt deleted file mode 100644 index 902a05eee6..0000000000 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -FetchContent_Declare(cxxopts - URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz - URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) -FetchContent_MakeAvailable(cxxopts) - -add_executable(benchmark_genai benchmark_genai.cpp) -target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) -set_target_properties(benchmark_genai PROPERTIES - COMPILE_PDB_NAME benchmark_genai - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS benchmark_genai - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md deleted file mode 100644 index d7b3f6ac21..0000000000 --- a/samples/cpp/benchmark_genai/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# LLMs benchmarking sample - -This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Usage - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -```sh -benchmark_genai [OPTIONS] -``` - -### Options - -- `-m, --model`: Path to the model and tokenizers base directory. -- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. -- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. -- `-n, --num_iter` (default: `3`): Number of iterations. -- `-d, --device` (default: `"CPU"`): Device to run the model on. - -### Output: - -``` -benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 -``` - -``` -Load time: 3405.69 ms -Generate time: 1430.77 ± 3.04 ms -Tokenization time: 0.51 ± 0.02 ms -Detokenization time: 0.37 ± 0.01 ms -TTFT: 81.60 ± 0.54 ms -TPOT: 71.52 ± 2.72 ms -Throughput tokens/s: 13.98 ± 0.53 -``` - -For more information how performance metrics are calculated please follow [performance-metrics tutorial](../../../src/README.md#performance-metrics). 
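The metrics listed in the benchmarking README above (load time, TTFT, TPOT, throughput) come from the `PerfMetrics` object attached to generation results. A rough sketch of reading them in C++, assuming the accessor names from the performance-metrics documentation (`get_generate_duration`, `get_ttft`, `get_tpot`, `get_throughput`, each returning a mean/std pair); the model folder is a placeholder:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model folder
    auto result = pipe.generate("The Sky is blue because", ov::genai::max_new_tokens(20));
    auto metrics = result.perf_metrics;  // accumulated by the pipeline during generate()
    std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ms\n"
              << "TTFT: "          << metrics.get_ttft().mean << " ms\n"
              << "TPOT: "          << metrics.get_tpot().mean << " ms/token\n"
              << "Throughput: "    << metrics.get_throughput().mean << " tokens/s\n";
    return 0;
}
```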
diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt deleted file mode 100644 index 69578dc86c..0000000000 --- a/samples/cpp/chat_sample/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(chat_sample chat_sample.cpp) -target_link_libraries(chat_sample PRIVATE openvino::genai) -set_target_properties(chat_sample PROPERTIES - COMPILE_PDB_NAME chat_sample - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(chat_sample PRIVATE cxx_std_11) - -install(TARGETS chat_sample - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md deleted file mode 100644 index bdc1d294ee..0000000000 --- a/samples/cpp/chat_sample/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# C++ chat_sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run: - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`chat_sample TinyLlama-1.1B-Chat-v1.0` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. 
`Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. - -#### Missing chat template - -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. -The following template can be used as a default, but it may not work properly with every model: -``` -"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` diff --git a/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt b/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt deleted file mode 100644 index 1d3f6307c0..0000000000 --- a/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) -add_executable(lora_greedy_causal_lm lora_greedy_causal_lm.cpp) -target_link_libraries(lora_greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(lora_greedy_causal_lm PROPERTIES - COMPILE_PDB_NAME lora_greedy_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(lora_greedy_causal_lm PRIVATE cxx_std_11) -install(TARGETS lora_greedy_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt deleted file mode 100644 index 83b2335431..0000000000 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) -target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) -set_target_properties(multinomial_causal_lm PROPERTIES - COMPILE_PDB_NAME multinomial_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) - -install(TARGETS multinomial_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md deleted file mode 100644 index 35ca054fdd..0000000000 --- a/samples/cpp/multinomial_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. 
The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt deleted file mode 100644 index b0ce8b1b60..0000000000 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
- NO_CMAKE_FIND_ROOT_PATH -) - -set(TARGET_NAME prompt_lookup_decoding_lm) -add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) - -set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md deleted file mode 100644 index 2057ff2c6f..0000000000 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3 - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -source /setupvars.sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
- -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt deleted file mode 100644 index 7c48b6cc0b..0000000000 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -find_package(OpenVINOGenAI REQUIRED - PATHS - "${CMAKE_BINARY_DIR}" # Reuse the package from the build. - ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. - NO_CMAKE_FIND_ROOT_PATH -) - -set(TARGET_NAME speculative_decoding_lm) -add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai) - -set_target_properties(${TARGET_NAME} PROPERTIES - COMPILE_PDB_NAME ${TARGET_NAME} - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) - -install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md deleted file mode 100644 index 7ca26164a6..0000000000 --- a/samples/cpp/speculative_decoding_lm/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3 - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. 
The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf -``` - -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
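As a sketch of how the speculative-decoding flow described above is typically wired up through the GenAI API: the draft model is attached as a pipeline property and the number of draft-proposed tokens is set in the generation config. The `ov::genai::draft_model` property and the `num_assistant_tokens` field are assumptions based on the GenAI documentation, and the dolly folder names are placeholders borrowed from the workflow earlier in this patch:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    config.num_assistant_tokens = 5;  // K tokens proposed by the draft per validation step

    // Placeholder folders with exported main and draft models;
    // the draft may run on a different device than the main model.
    ov::genai::LLMPipeline pipe("dolly-v2-7b", "CPU",
                                ov::genai::draft_model("dolly-v2-3b", "CPU"));

    std::cout << pipe.generate("Alan Turing was a", config) << '\n';
    return 0;
}
```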
diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index 377682974e..f798b4f5fc 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -8,28 +8,54 @@ find_package(OpenVINOGenAI REQUIRED NO_CMAKE_FIND_ROOT_PATH ) -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) -set_target_properties(greedy_causal_lm PROPERTIES - COMPILE_PDB_NAME greedy_causal_lm - # Ensure out of box LC_RPATH on macOS with SIP - INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) +function(add_sample_executable target_name) + add_executable(${target_name} ${target_name}.cpp) + target_link_libraries(${target_name} PRIVATE openvino::genai) + set_target_properties(${target_name} PROPERTIES + COMPILE_PDB_NAME ${target_name} + # Ensure out-of-box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + install(TARGETS ${target_name} + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) +endfunction() -install(TARGETS greedy_causal_lm - RUNTIME DESTINATION samples_bin/ - COMPONENT samples_bin - EXCLUDE_FROM_ALL) +set (SAMPLE_LIST + greedy_causal_lm + encrypted_model_causal_lm + beam_search_causal_lm + chat_sample + lora_greedy_causal_lm + multinomial_causal_lm + prompt_lookup_decoding_lm + speculative_decoding_lm) + +foreach(sample ${SAMPLE_LIST}) + add_sample_executable(${sample}) +endforeach() + + +# benchmark_genai +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) -add_executable(encrypted_model_causal_lm encrypted_model_causal_lm.cpp) -target_link_libraries(encrypted_model_causal_lm PRIVATE openvino::genai) -set_target_properties(encrypted_model_causal_lm PROPERTIES - COMPILE_PDB_NAME encrypted_model_causal_lm +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(encrypted_model_causal_lm PRIVATE cxx_std_11) -install(TARGETS encrypted_model_causal_lm +install(TARGETS benchmark_genai RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index 6928d03927..d9e5bd8d22 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -1,44 +1,147 @@ -# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3 +# OpenVINO GenAI Text Generation Samples -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. 
There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. +The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. +There are also Jupyter notebooks for some samples. You can find links to them in the appropriate sample descriptions. + +## Table of Contents +1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +2. [Sample Descriptions](#sample-descriptions) +3. [Troubleshooting](#troubleshooting) +4. [Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --model <model> <output_folder> ``` -## Run - -Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. - -`greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - +## Sample Descriptions +### Common information +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Using encrypted models +### 1. Greedy Causal LM (`greedy_causal_lm`) +- **Description:** +Basic text generation using a causal language model. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Demonstrates simple text continuation. +- **Run Command:** + ```bash + ./greedy_causal_lm <MODEL_DIR> "<PROMPT>" + ```
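For reference, the greedy sample reduces to a few `ov::genai::LLMPipeline` calls. A minimal sketch (not the sample's exact source; the model folder and prompt are placeholders, and greedy decoding is simply the default when no sampling options are set):

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder folder with a model exported by optimum-cli.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");
    // max_new_tokens bounds the continuation length; greedy decoding is the default.
    std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(100)) << '\n';
    return 0;
}
```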
+ +### 2. Beam Search Causal LM (`beam_search_causal_lm`) +- **Description:** +Uses beam search for more coherent text generation. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Improves text quality with beam search. +- **Run Command:** + ```bash + ./beam_search_causal_lm <MODEL_DIR> "<PROMPT 1>" ["<PROMPT 2>" ...] + ``` + +### 3. Chat Sample (`chat_sample`) +- **Description:** +Interactive chat interface powered by OpenVINO. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat-v1.0, etc. +- **Main Feature:** Real-time chat-like text generation. +- **Run Command:** + ```bash + ./chat_sample <MODEL_DIR> + ``` +#### Missing chat template +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` +### 4. Multinomial Causal LM (`multinomial_causal_lm`) +- **Description:** Text generation with multinomial sampling for diversity. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Introduces randomness for creative outputs. +- **Run Command:** + ```bash + ./multinomial_causal_lm <MODEL_DIR> "<PROMPT>" + ``` + +### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) +- **Description:** +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching over the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Specialized prompt-based inference. +- **Run Command:** + ```bash + ./prompt_lookup_decoding_lm <MODEL_DIR> "<PROMPT>" + ```
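A condensed sketch of how prompt-lookup decoding is typically enabled through the GenAI API. The `ov::genai::prompt_lookup` pipeline property and the assisted-generation fields `num_assistant_tokens` and `max_ngram_size` are assumptions based on the GenAI documentation; the model folder and prompt are placeholders, and the sample source remains the authoritative reference:

```cpp
#include <iostream>
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    config.num_assistant_tokens = 5;  // candidate tokens proposed per step (assumed field)
    config.max_ngram_size = 3;        // n-gram window matched against the prompt (assumed field)

    // The prompt_lookup property (assumed here) switches the pipeline to prompt-lookup decoding.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU", ov::genai::prompt_lookup(true));

    auto streamer = [](std::string subword) {
        std::cout << subword << std::flush;
        return false;  // keep generating
    };
    pipe.generate("return 0;", config, streamer);
    std::cout << std::endl;
    return 0;
}
```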
+ +### 6. Speculative Decoding LM (`speculative_decoding_lm`) +- **Description:** +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation by running an additional, smaller draft model alongside the main model. + +Speculative decoding works as follows. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers https://arxiv.org/pdf/2211.17192.pdf and https://arxiv.org/pdf/2302.01318.pdf + +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. + +Recommended models: meta-llama/Llama-2-13b-hf as main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as draft model, etc. +- **Main Feature:** Reduces latency while generating high-quality text. +- **Run Command:** + ```bash + ./speculative_decoding_lm <MODEL_DIR> <DRAFT_MODEL_DIR> "<PROMPT>" + ``` + +### 7. Encrypted Model Causal LM (`encrypted_model_causal_lm`) +- **Description:** LLMPipeline and Tokenizer objects can be initialized directly from the memory buffer, e.g. when user stores only encrypted files and decrypts them on-the-fly. The following code snippet demonstrates how to load the model from the memory buffer: - ```cpp auto [model_str, weights_tensor] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); ov::genai::Tokenizer tokenizer(models_path); ov::genai::LLMPipeline pipe(model_str, weights_tensor, tokenizer, device); ``` For the sake of brevity the code above does not include Tokenizer decryption. For more details look to encrypted_model_causal_lm sample. -- **Main Feature:** Read model directly from memory buffer +- **Run Command:** + ```bash + ./encrypted_model_causal_lm <MODEL_DIR> "<PROMPT>" + ``` + +### 8. LLMs benchmarking sample (`benchmark_genai`) +- **Description:** +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +For more information on how performance metrics are calculated, please follow the [performance-metrics tutorial](../../../src/README.md#performance-metrics). +- **Main Feature:** Benchmark model via GenAI +- **Run Command:** + ```bash + ./benchmark_genai [OPTIONS] + ``` + #### Options +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + + +## Troubleshooting + +### Unicode characters encoding error on Windows Example error: ``` @@ -48,3 +151,7 @@ UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: 1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 2.
Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +## Support and Contribution +- For troubleshooting, consult the [OpenVINO documentation](https://docs.openvino.ai). +- To report issues or contribute, visit the [GitHub repository](https://github.com/openvinotoolkit/openvino.genai). diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/text_generation/beam_search_causal_lm.cpp similarity index 100% rename from samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp rename to samples/cpp/text_generation/beam_search_causal_lm.cpp diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/text_generation/benchmark_genai.cpp similarity index 100% rename from samples/cpp/benchmark_genai/benchmark_genai.cpp rename to samples/cpp/text_generation/benchmark_genai.cpp diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/text_generation/chat_sample.cpp similarity index 100% rename from samples/cpp/chat_sample/chat_sample.cpp rename to samples/cpp/text_generation/chat_sample.cpp diff --git a/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp b/samples/cpp/text_generation/lora_greedy_causal_lm.cpp similarity index 100% rename from samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp rename to samples/cpp/text_generation/lora_greedy_causal_lm.cpp diff --git a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp b/samples/cpp/text_generation/multinomial_causal_lm.cpp similarity index 100% rename from samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp rename to samples/cpp/text_generation/multinomial_causal_lm.cpp diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp similarity index 100% rename from samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp rename to samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/text_generation/speculative_decoding_lm.cpp similarity index 100% rename from samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp rename to samples/cpp/text_generation/speculative_decoding_lm.cpp diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md deleted file mode 100644 index fac6a26e8e..0000000000 --- a/samples/python/beam_search_causal_lm/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Text generation Python sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample fearures `openvino_genai.LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md deleted file mode 100644 index 95f24b6eca..0000000000 --- a/samples/python/benchmark_genai/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# LLMs benchmarking sample - -This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - - -## Usage - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -```sh -python benchmark_genai.py [OPTIONS] -``` - -### Options - -- `-m, --model`: Path to the model and tokenizers base directory. -- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. -- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-n, --num_iter` (default: `3`): Number of iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. -- `-d, --device` (default: `"CPU"`): Device to run the model on. 
- -### Output: - -``` -python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 -``` - -``` -Load time: 3405.69 ms -Generate time: 1430.77 ± 3.04 ms -Tokenization time: 0.51 ± 0.02 ms -Detokenization time: 0.37 ± 0.01 ms -TTFT: 81.60 ± 0.54 ms -TPOT: 71.52 ± 2.72 ms -Throughput tokens/s: 13.98 ± 0.53 -``` - -For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md deleted file mode 100644 index 7e3c206431..0000000000 --- a/samples/python/chat_sample/README.md +++ /dev/null @@ -1,46 +0,0 @@ -# Python chat_sample that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run: - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python chat_sample.py TinyLlama-1.1B-Chat-v1.0` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. - -#### Missing chat template - -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. 
-The following template can be used as a default, but it may not work properly with every model: -``` -"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", -``` diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md deleted file mode 100644 index c1afc08a8d..0000000000 --- a/samples/python/multinomial_causal_lm/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -This sample also contains example implementation of an iterable streamer with bufferisation. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -## Streaming - -This Python example demonstrates custom detokenization with bufferization. The streamer receives integer tokens corresponding to each word or subword, one by one. If tokens are decoded individually, the resulting text misses necessary spaces because of detokenize(tokenize(" a")) == "a". - -To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed. - -At the same time, in order to optimize the performance in streaming mode, we provide the Chuck Streaming. 
Chunk streaming has significant benefits to very small LLM for streaming generate token rate improvement. It does sampling once after several token generation. We can use the tokens_len parameter to control the number of tokens in the token_cache before sampling. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/prompt_lookup_decoding_lm/README.md b/samples/python/prompt_lookup_decoding_lm/README.md deleted file mode 100644 index 1e5f4003d4..0000000000 --- a/samples/python/prompt_lookup_decoding_lm/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# prompt_lookup_decoding_lm Python sample that supports most popular models like LLaMA 3 - -[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -It's not required to install [../../export-requirements.txt](../../export requirements.txt) for deployment if the model has already been exported. - -```sh -source /setupvars.sh -pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python prompt_lookup_decoding_lm.py ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. 
It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/speculative_decoding_lm/README.md b/samples/python/speculative_decoding_lm/README.md deleted file mode 100644 index 7d2656c0a3..0000000000 --- a/samples/python/speculative_decoding_lm/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# speculative_decoding_lm Python sample that supports most popular models like LLaMA 3 and other - -Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alongside with the main model. - -Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. - -This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original paper https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf - -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Run `optimum-cli` to generate IRs for the samples. - -## Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. - -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. - -Download assisting and main model to run speculative decoding sample. 
- -```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b -optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b -``` - -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python speculative_decoding_lm.py ./dolly-v2-7b ./dolly-v2-3b "Why is the Sun yellow?"` - - -Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. - -See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. - - -> *_NOTE:_* User can run speculative decoding on different devices. Please, specify `device` in `LLMPipeline` constructor to run main model and `device` for `draft_model` in the constructor. - -### Troubleshooting - -#### Unicode characters encoding error on Windows - -Example error: -``` -UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to -``` - -If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: -1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. -2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index a634e21cb0..9940904cfb 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -1,48 +1,131 @@ -# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 +# OpenVINO GenAI Text Generation Python Samples -This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. +These samples showcase the use of OpenVINO's inference capabilities for text generation tasks, including different decoding strategies such as beam search, multinomial sampling, and speculative decoding. Each sample has a specific focus and demonstrates a unique aspect of text generation. +The applications don't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. +There are also Jupyter notebooks for some samples. You can find links to them in the appropriate sample descriptions.
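+
+As a quick orientation, all of these samples are built around the same `openvino_genai.LLMPipeline` API. The sketch below is illustrative only: the model directory and prompt are placeholders, not tied to any specific sample.
+
+```python
+import openvino_genai
+
+# Directory produced by `optimum-cli export openvino` (placeholder);
+# switch "CPU" to "GPU" to run on a discrete GPU.
+pipe = openvino_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")
+
+# Generate a bounded continuation of the prompt. The individual samples layer
+# different decoding strategies (beam search, multinomial sampling, speculative
+# decoding) on top of this same call.
+print(pipe.generate("Why is the Sun yellow?", max_new_tokens=100))
+```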
-There are two sample files: - - [`greedy_causal_lm.py`](./greedy_causal_lm.py) demonstrates basic usage of the LLM pipeline - - [`lora.py`](./lora.py) shows how to apply LoRA adapters to the pipeline +## Table of Contents +1. [Download and Convert the Model and Tokenizers](#download-and-convert-the-model-and-tokenizers) +2. [Sample Descriptions](#sample-descriptions) +3. [Troubleshooting](#troubleshooting) +4. [Support and Contribution](#support-and-contribution) ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. +Installing [../../export-requirements.txt](../../export-requirements.txt) is only needed to convert a model; it is not required for deployment if the model has already been exported. ```sh -pip install --upgrade-strategy eager -r ../../export-requirements.txt -optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --model ``` -## Run - -Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: - -`python greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` - +## Sample Descriptions +### Common information +Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to get common information about OpenVINO samples. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. -## Run with optional LoRA adapters - -LoRA adapters can be connected to the pipeline and modify generated text. Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models) or trained by the user. Adapters compatible with a base model should be used only. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters in command line. Check `lora.py` source code to learn how to enable adapters and specify them in each `generate` call. - -Here is an example how to run the sample with a single adapter. First download adapter file from TODO page manually and save it as TODO. Or download it from command line: - -#TODO command to download adapter - -Then run `lora.py`: - -#TODO command to run lora.py with adapter - -### Troubleshooting +### 1. Greedy Causal LM (`greedy_causal_lm`) +- **Description:** +Basic text generation using a causal language model. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Demonstrates simple text continuation. +- **Run Command:** + ```bash + python greedy_causal_lm.py [-h] model_dir prompt + ``` + +### 2.
Beam Search Causal LM (`beam_search_causal_lm`) +- **Description:** +Uses beam search for more coherent text generation. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-question-answering) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Improves text quality with beam search. +- **Run Command:** + ```bash + python beam_search_causal_lm.py model_dir prompt [prompts ...] + ``` + +### 3. Chat Sample (`chat_sample`) +- **Description:** +Interactive chat interface powered by OpenVINO. +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) that provides an example of LLM-powered text generation in Python. +Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat-v1.0, etc. +- **Main Feature:** Real-time chat-like text generation. +- **Run Command:** + ```bash + python chat_sample.py model_dir + ``` +#### Missing chat template +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` -#### Unicode characters encoding error on Windows +### 4. Multinomial Causal LM (`multinomial_causal_lm`) +- **Description:** Text generation with multinomial sampling for diversity. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Introduces randomness for creative outputs. +- **Run Command:** + ```bash + python multinomial_causal_lm.py model_dir prompt + ``` + +### 5. Prompt Lookup Decoding LM (`prompt_lookup_decoding_lm`) +- **Description:** +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching over the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. +Recommended models: meta-llama/Llama-2-7b-hf, etc. +- **Main Feature:** Specialized prompt-based inference. +- **Run Command:** + ```bash + python prompt_lookup_decoding_lm.py model_dir prompt + ``` + +### 6.
Speculative Decoding LM (`speculative_decoding_lm`) +- **Description:** +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation when an additional smaller draft model is used alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case, they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers: https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +Here is a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/speculative-sampling) that provides an example of LLM-powered text generation in Python. + +Recommended models: meta-llama/Llama-2-13b-hf as the main model and TinyLlama/TinyLlama-1.1B-Chat-v1.0 as the draft model, etc. +- **Main Feature:** Reduces latency while generating high-quality text. +- **Run Command:** + ```bash + python speculative_decoding_lm.py model_dir draft_model_dir prompt + ``` + +### 7. LLMs benchmarking sample (`benchmark_genai`) +- **Description:** +This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +For more information on how performance metrics are calculated, please follow the [performance-metrics tutorial](../../../src/README.md#performance-metrics). +- **Main Feature:** Benchmark model via GenAI +- **Run Command:** + ```bash + python benchmark_genai.py [-m MODEL] [-p PROMPT] [-nw NUM_WARMUP] [-n NUM_ITER] [-mt MAX_NEW_TOKENS] [-d DEVICE] + ``` +#### Options +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + + +## Troubleshooting + +### Unicode characters encoding error on Windows Example error: ``` @@ -52,3 +135,7 @@ UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: 1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`.
`Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +## Support and Contribution +- For troubleshooting, consult the [OpenVINO documentation](https://docs.openvino.ai). +- To report issues or contribute, visit the [GitHub repository](https://github.com/openvinotoolkit/openvino.genai). diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/text_generation/beam_search_causal_lm.py similarity index 100% rename from samples/python/beam_search_causal_lm/beam_search_causal_lm.py rename to samples/python/text_generation/beam_search_causal_lm.py diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/text_generation/benchmark_genai.py similarity index 100% rename from samples/python/benchmark_genai/benchmark_genai.py rename to samples/python/text_generation/benchmark_genai.py diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/text_generation/chat_sample.py similarity index 100% rename from samples/python/chat_sample/chat_sample.py rename to samples/python/text_generation/chat_sample.py diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/text_generation/multinomial_causal_lm.py similarity index 100% rename from samples/python/multinomial_causal_lm/multinomial_causal_lm.py rename to samples/python/text_generation/multinomial_causal_lm.py diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/text_generation/prompt_lookup_decoding_lm.py similarity index 100% rename from samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py rename to samples/python/text_generation/prompt_lookup_decoding_lm.py diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/text_generation/speculative_decoding_lm.py similarity index 100% rename from samples/python/speculative_decoding_lm/speculative_decoding_lm.py rename to samples/python/text_generation/speculative_decoding_lm.py diff --git a/src/README.md b/src/README.md index 6466b431d0..5d18d0b67b 100644 --- a/src/README.md +++ b/src/README.md @@ -231,7 +231,7 @@ custom_streamer = CustomStreamer() pipe.generate("The Sun is yellow because", max_new_tokens=15, streamer=custom_streamer) ``` -For fully implemented iterable CustomStreamer please refer to [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/multinomial_causal_lm/README.md) sample. +For a fully implemented iterable CustomStreamer, please refer to the [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/text_generation/README.md) sample. Continuous batching with LLMPipeline: @@ -394,7 +394,7 @@ durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m print(f'Median from token to token duration: {np.median(durations):.2f} ms') ``` -For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples. +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/text_generation/README.md) and C++ [benchmark_genai](../samples/cpp/text_generation/README.md) samples. ## How It Works