
Commit 995e4eb

Merge remote-tracking branch 'upstream/master' into prefix_caching

popovaan committed Jul 17, 2024
2 parents a154dba + 7f5e8d2
Showing 35 changed files with 2,085 additions and 736 deletions.
99 changes: 8 additions & 91 deletions .github/workflows/genai_python_lib.yml
@@ -11,7 +11,7 @@ env:
jobs:
ubuntu_genai_python_lib:
# A tokenizers' dependency fails to compile on ubuntu-20 and in the CentOS7 env.
runs-on: ubuntu-22.04
runs-on: ubuntu-22.04-16-cores
env:
# A tokenizers' dependency fails to compile with Ninja in the CentOS7 env.
CMAKE_GENERATOR: Unix Makefiles
@@ -30,9 +30,9 @@ jobs:
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/
- run: source ./ov/setupvars.sh && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: python -m pytest ./tests/python_tests/

macos_genai_python_lib:
runs-on: macos-12
@@ -53,13 +53,12 @@ jobs:
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/
- run: source ./ov/setupvars.sh && python -m pip install . --verbose
- run: python -c "from openvino_genai import LLMPipeline"
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: python -m pytest ./tests/python_tests/

windows_genai_python_lib:
if: false
runs-on: windows-latest
env:
CMAKE_BUILD_PARALLEL_LEVEL: null
@@ -73,62 +72,6 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: 3.8
- run: curl --output ov.zip ${{ env.l_ov_link }}
- run: unzip -d ov ov.zip
- run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
shell: bash
- run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j
- run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
# cmd evaluates variables differently; setting PYTHONPATH before calling setupvars.bat (rather than after) works around that.
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_generate_api.py -m precommit
- run: call ./ov/setupvars.bat && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit

continuous_batching_python_lib_ubuntu:
# A tokenizers' dependency fails to compile on ubuntu-20 and in the CentOS7 env.
runs-on: ubuntu-22.04
env:
# A tokenizers' dependency fails to compile with Ninja in the CentOS7 env.
CMAKE_GENERATOR: Unix Makefiles
CMAKE_BUILD_PARALLEL_LEVEL: null
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
# Install the CentOS7 OpenVINO build instead of the Ubuntu one to match the PyPI distribution ABI.
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.l_ov_centos_link }} | tar --directory ./ov/ --strip-components 1 -xz
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- name: Install dependencies and build
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: source ./ov/setupvars.sh && python -m pip install .
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit

continuous_batching_python_lib_windows:
runs-on: windows-latest
defaults:
run:
shell: cmd
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8

- name: Install OpenVINO
run: |
curl --output ov.zip ${{ env.w_ov_link }}
@@ -141,33 +84,7 @@ jobs:
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_preemption.py -m precommit
# cmd evaluates variables differently; setting PYTHONPATH before calling setupvars.bat (rather than after) works around that.
- run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/
- run: call ./ov/setupvars.bat && python -m pip install . --verbose
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit


continuous_batching_python_lib_macos:
runs-on: macos-12
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Install OpenVINO
run: |
mkdir ./ov/
curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
brew install coreutils scons
- name: Download, convert and build
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release -j
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_sampling.py -m precommit
- run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: source ./ov/setupvars.sh && python -m pip install .
- run: python -m pytest ./tests/python_tests/test_preemption.py -m precommit
- run: python -m pytest ./tests/python_tests/
@@ -161,6 +161,10 @@ std::vector<int64_t> LMSDiscreteScheduler::get_timesteps() const {
}

std::map<std::string, ov::Tensor> LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) {
if (inference_step == 0) {
m_derivative_list.clear();
}

// LMS step function:
std::vector<float> derivative;
derivative.reserve(latents.get_size());
2 changes: 1 addition & 1 deletion image_generation/requirements.txt
@@ -1,2 +1,2 @@
-r ../samples/requirements.txt
diffusers==0.27.2
diffusers==0.29.2
8 changes: 4 additions & 4 deletions image_generation/stable_diffusion_1_5/cpp/src/main.cpp
@@ -365,11 +365,11 @@ int32_t main(int32_t argc, char* argv[]) try {

ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt, do_classifier_free_guidance);

for (uint32_t n = 0; n < num_images; n++) {
std::shared_ptr<Scheduler> scheduler = std::make_shared<LMSDiscreteScheduler>();
scheduler->set_timesteps(num_inference_steps);
std::vector<std::int64_t> timesteps = scheduler->get_timesteps();
std::shared_ptr<Scheduler> scheduler = std::make_shared<LMSDiscreteScheduler>();
scheduler->set_timesteps(num_inference_steps);
std::vector<std::int64_t> timesteps = scheduler->get_timesteps();

for (uint32_t n = 0; n < num_images; n++) {
std::uint32_t seed = num_images == 1 ? user_seed : user_seed + n;

const size_t unet_in_channels = static_cast<size_t>(sample_shape[1].get_length());
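Taken together with the derivative-list reset added to LMSDiscreteScheduler::step above, this hunk hoists scheduler creation out of the per-image loop, so a single scheduler instance now serves every generated image; the step-zero reset keeps one image's LMS history from leaking into the next. Below is a minimal, self-contained C++ sketch of that reset-on-first-step pattern. The Stepper class and its float stand-ins are illustrative only, not code from this repository.

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative stand-in for a stateful scheduler: history accumulated for
// one generation must not leak into the next, so it is cleared at step 0
// (the same idea as m_derivative_list.clear() in the hunk above).
class Stepper {
    std::vector<float> m_history;
public:
    float step(float value, std::size_t inference_step) {
        if (inference_step == 0)
            m_history.clear();
        m_history.push_back(value);
        return static_cast<float>(m_history.size());
    }
};

int main() {
    Stepper stepper;  // created once, like the hoisted scheduler
    for (int image = 0; image < 2; ++image) {
        for (std::size_t i = 0; i < 3; ++i)
            std::cout << stepper.step(0.5f, i) << ' ';
        std::cout << '\n';  // prints "1 2 3" for each image, not "1 2 3 4 5 6"
    }
}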
7 changes: 6 additions & 1 deletion llm_bench/python/benchmark.py
@@ -101,7 +101,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
mem_consumption.start_collect_memory_consumption()
max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
start = time.perf_counter()
if args['infer_count'] is not None:
if args['infer_count'] is not None and args['end_token_stopping'] is False:
model.generation_config.eos_token_id = None
model.config.eos_token_id = None
result = model.generate(
@@ -693,6 +693,11 @@ def get_argprser():
parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files')
utils.model_utils.add_stateful_model_arguments(parser)
parser.add_argument("--genai", action="store_true")
parser.add_argument(
'--end_token_stopping',
action='store_true',
help=f'Stop generation at the end token even if the output token count has not reached infer_count or the max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}).'
)

return parser.parse_args()

1 change: 1 addition & 0 deletions llm_bench/python/utils/model_utils.py
@@ -139,6 +139,7 @@ def analyze_args(args):
if model_args['prompt_index'] is not None:
# Deduplication
[model_args['prompt_index'].append(i) for i in args.prompt_index if i not in model_args['prompt_index']]
model_args['end_token_stopping'] = args.end_token_stopping

model_framework = args.framework
model_path = Path(args.model)
13 changes: 9 additions & 4 deletions llm_bench/python/utils/nncf_utils.py
@@ -38,10 +38,9 @@ def get_compressed_path(output_dir: str, base_precision, option: str):


INT4_MODEL_CONFIGURATION = {
"dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
"gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
"opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
"bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6},
"red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
"zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}},
@@ -58,7 +57,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str):
"rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8},
"chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72},
"qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6},
"open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True},
"falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True},
"orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True,
"dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}},
@@ -70,7 +69,13 @@
"mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9},
"llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
"vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0},
"stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"gpt-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.5, "scale": True},
"longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"stablelm-7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.6, "scale": True},
"phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.12",
]
dependencies = [
"openvino_tokenizers~=2024.3.0.0"
"openvino_tokenizers~=2024.3.0.0.dev"
]

[tool.py-build-cmake.module]
2 changes: 1 addition & 1 deletion samples/cpp/chat_sample/chat_sample.cpp
@@ -10,7 +10,7 @@ int main(int argc, char* argv[]) try {
std::string prompt;
std::string model_path = argv[1];

std::string device = "CPU"; // GPU can be used as well
std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(model_path, "CPU");

ov::genai::GenerationConfig config;
@@ -11,8 +11,6 @@
#include <mutex>
#include <atomic>


#include <openvino/openvino.hpp>
#include <nlohmann/json.hpp>
#include <cxxopts.hpp>

@@ -123,6 +121,11 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data

ov::genai::GenerationConfig greedy_search = ov::genai::greedy();
greedy_search.max_new_tokens = std::min(max_output_len, output_len);
greedy_search.repetition_penalty = 1.0;
greedy_search.frequency_penalty = 0.0;
greedy_search.presence_penalty = 0.0;
greedy_search.diversity_penalty = 0.0;
greedy_search.length_penalty = 0.0;

dataset.push_data(human_question, greedy_search);
dataset.push_lens(input_len, output_len);
15 changes: 14 additions & 1 deletion src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -215,7 +215,20 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& config);

void start_chat();

/**
* @brief Start chat, keeping the history in the KV cache.
* Turns on keeping the KV cache between generate calls and automatic application of chat templates.
* If beam search is used, the KV cache is kept for the generated sequence with the maximal score.
*
* @param system_message optional system message.
*/
void start_chat(const std::string& system_message = "");

/**
* @brief Finish chat and clear the KV cache.
* Turns off keeping the KV cache between generate calls.
*/
void finish_chat();
private:
std::unique_ptr<LLMPipelineImplBase> m_pimpl;
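The new start_chat overload pairs with finish_chat in a simple usage pattern. The following is a hedged sketch, not an official sample; it reuses the generate() call style from this repository's samples, and "model_dir" is a placeholder for a real exported model directory.

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main() {
    // "model_dir" is a placeholder for a real exported model directory.
    ov::genai::LLMPipeline pipe("model_dir", "CPU");

    // Seed the chat with an optional system message; the history is then
    // kept in the KV cache across generate() calls.
    pipe.start_chat("You are a helpful assistant.");
    std::string first = pipe.generate("What is OpenVINO?");
    std::cout << first << '\n';
    std::string second = pipe.generate("Summarize that in one sentence.");
    std::cout << second << '\n';
    pipe.finish_chat();  // stop keeping history and clear the KV cache
}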
2 changes: 1 addition & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
@@ -79,7 +79,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @return A string with the transformed and concatenated prompts from the chat history.
* @throws Exception if the chat template was unable to parse the input history.
*/
std::string apply_chat_template(const ChatHistory& history,
std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
const std::string& chat_template="") const;

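apply_chat_template now takes ChatHistory by value rather than by const reference, presumably so the implementation can adjust its own copy before rendering. Below is a usage sketch under that assumption; it treats ChatHistory as a sequence of {"role", "content"} entries, as used elsewhere in openvino_genai, and the tokenizer path is a placeholder.

#include "openvino/genai/tokenizer.hpp"
#include <iostream>
#include <string>

int main() {
    ov::genai::Tokenizer tokenizer("model_dir");  // placeholder path
    ov::genai::ChatHistory history{
        {{"role", "system"}, {"content", "You are a helpful assistant."}},
        {{"role", "user"}, {"content", "What is OpenVINO?"}}
    };
    // Pass-by-value hands the method its own mutable copy of the history.
    std::string prompt = tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
    std::cout << prompt << '\n';
}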
1 change: 1 addition & 0 deletions src/cpp/src/continuous_batching_pipeline.cpp
@@ -61,6 +61,7 @@ class ContinuousBatchingPipeline::Impl {
for (const auto& sequence: request->get_sequences()) {
m_scheduler->free_sequence(sequence->get_id());
}
m_sampler->clear_beam_search_info(request->get_request_id());
requests_iterator = m_requests.erase(requests_iterator);
} else {
requests_iterator++;