From e4637b3a197cd03281e9b68f0e2f3a100c47b28a Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Mon, 15 Jul 2024 13:48:22 +0400
Subject: [PATCH 01/18] Workaround (#618)

Workaround Python_VERSION_MAJOR and MINOR not being set by replacing
Python3 with Python.

Disable generation of some of the COMPONENTs not needed for GenAI. There
are still unwanted empty archives, but they are generated unconditionally
by rapidjson.
---
 CMakeLists.txt            |  3 +++
 src/python/CMakeLists.txt | 43 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8965e8b3e0..be8e03548a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,9 @@ project(OpenVINOGenAI
         HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
         LANGUAGES CXX)
 
+option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" OFF)
+option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF)
+
 # Find OpenVINODeveloperPackage first to compile with SDL flags
 find_package(OpenVINODeveloperPackage QUIET
              PATHS "${OpenVINO_DIR}")
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index 1867c72fa5..bcbdb77b49 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -11,9 +11,50 @@ FetchContent_Declare(
 FetchContent_GetProperties(pybind11)
 # search for FindPython3.cmake instead of legacy modules
 set(PYBIND11_FINDPYTHON ON)
+
+# Workaround Python_VERSION_MAJOR and MINOR not being set by finding
+# Python package instead of Python3
+macro(ov_find_python_no_3 find_package_mode)
+    # Settings for FindPython3.cmake
+    if(NOT DEFINED Python3_USE_STATIC_LIBS)
+        set(Python3_USE_STATIC_LIBS OFF)
+    endif()
+
+    if(NOT DEFINED Python3_FIND_VIRTUALENV)
+        set(Python3_FIND_VIRTUALENV FIRST)
+    endif()
+
+    if(NOT DEFINED Python3_FIND_IMPLEMENTATIONS)
+        set(Python3_FIND_IMPLEMENTATIONS CPython PyPy)
+    endif()
+
+    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+        set(python3_development_component Development.Module)
+    else()
+        set(python3_development_component Development)
+    endif()
+
+    if(CMAKE_CROSSCOMPILING AND LINUX)
+        # allow to find python headers from host in case of cross-compilation
+        # e.g. install libpython3-dev: and finds its headers
+        set(_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE})
+        set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
+        ov_cross_compile_define_debian_arch()
+    endif()
+
+    find_package(Python ${find_package_mode} COMPONENTS Interpreter ${python3_development_component})
+
+    if(CMAKE_CROSSCOMPILING AND LINUX)
+        ov_cross_compile_define_debian_arch_reset()
+        set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE})
+    endif()
+
+    unset(python3_development_component)
+endmacro()
+
 # the following two calls are required for cross-compilation
 if(OpenVINODeveloperPackage_DIR)
-    ov_find_python3(REQUIRED)
+    ov_find_python_no_3(REQUIRED)
     ov_detect_python_module_extension()
 else()
     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)

From 423c8e36d8a7a9b8489f852e1a3845b7e2a32944 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Mon, 15 Jul 2024 14:40:20 +0400
Subject: [PATCH 02/18] Revert to python3

---
 src/python/CMakeLists.txt | 42 +--------------------------------------
 1 file changed, 1 insertion(+), 41 deletions(-)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index bcbdb77b49..f03f2f58d1 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -12,49 +12,9 @@ FetchContent_GetProperties(pybind11)
 # search for FindPython3.cmake instead of legacy modules
 set(PYBIND11_FINDPYTHON ON)
 
-# Workaround Python_VERSION_MAJOR and MINOR not being set by finding
-# Python package instead of Python3
-macro(ov_find_python_no_3 find_package_mode)
-    # Settings for FindPython3.cmake
-    if(NOT DEFINED Python3_USE_STATIC_LIBS)
-        set(Python3_USE_STATIC_LIBS OFF)
-    endif()
-
-    if(NOT DEFINED Python3_FIND_VIRTUALENV)
-        set(Python3_FIND_VIRTUALENV FIRST)
-    endif()
-
-    if(NOT DEFINED Python3_FIND_IMPLEMENTATIONS)
-        set(Python3_FIND_IMPLEMENTATIONS CPython PyPy)
-    endif()
-
-    if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
-        set(python3_development_component Development.Module)
-    else()
-        set(python3_development_component Development)
-    endif()
-
-    if(CMAKE_CROSSCOMPILING AND LINUX)
-        # allow to find python headers from host in case of cross-compilation
-        # e.g. install libpython3-dev: and finds its headers
-        set(_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${CMAKE_FIND_ROOT_PATH_MODE_INCLUDE})
-        set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
-        ov_cross_compile_define_debian_arch()
-    endif()
-
-    find_package(Python ${find_package_mode} COMPONENTS Interpreter ${python3_development_component})
-
-    if(CMAKE_CROSSCOMPILING AND LINUX)
-        ov_cross_compile_define_debian_arch_reset()
-        set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ${_old_CMAKE_FIND_ROOT_PATH_MODE_INCLUDE})
-    endif()
-
-    unset(python3_development_component)
-endmacro()
-
 # the following two calls are required for cross-compilation
 if(OpenVINODeveloperPackage_DIR)
-    ov_find_python_no_3(REQUIRED)
+    ov_find_python3(REQUIRED)
     ov_detect_python_module_extension()
 else()
     if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)

From 1b1b2f0ffdb96c7a5a77c40859c096b89a1da04a Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Mon, 15 Jul 2024 16:28:59 +0400
Subject: [PATCH 03/18] Fix cmake Python var name (#624)

---
 src/python/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index f03f2f58d1..3d03a0d7a8 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -65,10 +65,10 @@ endif()
 install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py"
               "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py"
         DESTINATION python/openvino_genai
-        COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR})
+        COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
 install(TARGETS py_generate_pipeline
         LIBRARY DESTINATION python/openvino_genai
-        COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR})
+        COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
 
 install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py"
         DESTINATION openvino_genai

From 70b74ad79eff75b14da04cfc2bbacdf2c0cd7e90 Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Mon, 15 Jul 2024 19:38:42 +0400
Subject: [PATCH 04/18] Add ContinuousBatchingPipeline constructor similar to
 LLMPipeline (#604)

That allows LLMPipeline to create ContinuousBatchingPipeline as a backend.

There's also a constructor accepting an ireq, which could be used if the
model was already transformed appropriately for ContinuousBatchingPipeline.
But it feels like that would be misleading, and it's simpler just to throw
if such a constructor is called with the ContinuousBatchingPipeline backend.
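Based on the constructor and Python binding added in this patch, usage is expected
to look roughly like the sketch below. This is illustrative only: the model folder
path is a placeholder and the generation settings are arbitrary.

```python
from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, SchedulerConfig, Tokenizer

# Placeholder: a folder with openvino_model.xml plus the tokenizer/detokenizer models.
model_path = "./TinyLlama-1.1B-Chat-v1.0"

scheduler_config = SchedulerConfig()
# The Tokenizer may be constructed from a different folder than the model.
tokenizer = Tokenizer(model_path)
pipe = ContinuousBatchingPipeline(model_path, tokenizer, scheduler_config, "CPU", {})

config = GenerationConfig()
config.max_new_tokens = 20
results = pipe.generate(["What is OpenVINO?"], [config])
print(results[0].m_generation_ids[0])
```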
---
 .github/workflows/causal_lm_cpp.yml           |  2 +-
 .../continuous_batching_accuracy.cpp          |  4 ++-
 .../genai/continuous_batching_pipeline.hpp    | 19 ++++++++++++-
 .../include/openvino/genai/llm_pipeline.hpp   |  4 +--
 src/cpp/include/openvino/genai/tokenizer.hpp  |  2 +-
 src/cpp/src/continuous_batching_pipeline.cpp  | 27 +++++++++++++------
 src/python/py_generate_pipeline.cpp           | 10 ++++---
 tests/python_tests/common.py                  |  2 +-
 tests/python_tests/test_sampling.py           |  4 +--
 9 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 80089a4e81..18cc89a8f0 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -648,7 +648,7 @@ jobs:
           python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
-          cmake -DCMAKE_BUILD_TYPE=Releas -S ./ -B ./build/
+          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j
       - name: Run gtests
         run: |
diff --git a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
index 6e0cb5034f..77485e36db 100644
--- a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
+++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp
@@ -78,7 +78,9 @@ int main(int argc, char* argv[]) try {
     // vLLM specific params
     scheduler_config.max_num_seqs = 2;
 
-    ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config);
+    // It's possible to construct a Tokenizer from a different path.
+    // If the Tokenizer isn't specified, it's loaded from the same folder.
+    ov::genai::ContinuousBatchingPipeline pipe(models_path, ov::genai::Tokenizer{models_path}, scheduler_config);
 
     std::vector<ov::genai::GenerationResult> generation_results = pipe.generate(prompts, sampling_params);
 
     for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) {
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index e30892f9c3..be9a5fd8c1 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -32,7 +32,24 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
                                const std::string& device = "CPU",
                                const ov::AnyMap& plugin_config = {});
 
-    std::shared_ptr<ov::genai::Tokenizer> get_tokenizer();
+    /**
+     * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using files from different dirs.
+     *
+     * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json
+     * @param scheduler_config
+     * @param tokenizer manually initialized ov::genai::Tokenizer
+     * @param device optional device
+     * @param plugin_config optional plugin_config
+     */
+    ContinuousBatchingPipeline(
+        const std::string& model_path,
+        const ov::genai::Tokenizer& tokenizer,
+        const SchedulerConfig& scheduler_config,
+        const std::string& device="CPU",
+        const ov::AnyMap& plugin_config={}
+    );
+
+    ov::genai::Tokenizer get_tokenizer();
 
     ov::genai::GenerationConfig get_config() const;
 
diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index b36eab7238..84dc02bd58 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -116,10 +116,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     );
 
     /**
-    * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs.
+    * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using files from different dirs.
     *
     * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json
-    * @param tokenizer manually initialized ov::Tokenizer
+    * @param tokenizer manually initialized ov::genai::Tokenizer
    * @param device optional device
    * @param plugin_config optional plugin_config
    */
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 4af45e7cfd..5a1e181e21 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -26,7 +26,7 @@ struct TokenizedInputs {
 class OPENVINO_GENAI_EXPORTS Tokenizer {
 public:
    /**
-    * @brief ov::Tokenizer constructor.
+    * @brief ov::genai::Tokenizer constructor.
    * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
    */
    Tokenizer(const std::string& tokenizer_path);
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index dbacf3c243..27c183ddd8 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -19,7 +19,7 @@ using namespace ov::genai;
 void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);
 
 class ContinuousBatchingPipeline::Impl {
-    std::shared_ptr<ov::genai::Tokenizer> m_tokenizer;
+    ov::genai::Tokenizer m_tokenizer;
     std::shared_ptr<Scheduler> m_scheduler;
     std::shared_ptr<CacheManager> m_cache_manager;
     std::shared_ptr<ModelRunner> m_model_runner;
@@ -69,9 +69,9 @@ class ContinuousBatchingPipeline::Impl {
     }
 
 public:
-    Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string device, const ov::AnyMap& plugin_config) {
+    Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config) :
+        m_tokenizer{tokenizer} {
         ov::Core core;
-        m_tokenizer = std::make_shared<ov::genai::Tokenizer>(models_path);
 
         // The model can be compiled for GPU as well
         std::shared_ptr<ov::Model> model = core.read_model(models_path + "/openvino_model.xml");
@@ -104,6 +104,9 @@ class ContinuousBatchingPipeline::Impl {
         // read default generation config
     }
 
+    Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config)
+        : Impl{models_path, Tokenizer(models_path), scheduler_config, device, plugin_config} {}
+
     ov::genai::GenerationConfig get_config() const {
         return m_generation_config;
     }
@@ -112,19 +115,19 @@ class ContinuousBatchingPipeline::Impl {
         return m_pipeline_metrics;
     }
 
-    std::shared_ptr<ov::genai::Tokenizer> get_tokenizer() {
+    ov::genai::Tokenizer get_tokenizer() {
         return m_tokenizer;
     }
 
     GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) {
-        sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id());
+        sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id());
         sampling_params.validate();
 
         ov::Tensor input_ids;
         {
             static ManualTimer timer("tokenize");
             timer.start();
-            input_ids = m_tokenizer->encode(prompt).input_ids;
+            input_ids = m_tokenizer.encode(prompt).input_ids;
             timer.end();
         }
@@ -262,7 +265,7 @@ class ContinuousBatchingPipeline::Impl {
             auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size());
             for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
                 const auto& generation_output = generation_outputs[generation_output_idx];
-                std::string output_text = m_tokenizer->decode(generation_output.generated_token_ids);
+                std::string output_text = m_tokenizer.decode(generation_output.generated_token_ids);
                 result.m_generation_ids.push_back(output_text);
                 result.m_scores.push_back(generation_output.score);
             }
@@ -282,7 +285,15 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& model
     m_impl = std::make_shared<Impl>(models_path, scheduler_config, device, plugin_config);
 }
 
+ContinuousBatchingPipeline::ContinuousBatchingPipeline(
+    const std::string& model_path,
+    const Tokenizer& tokenizer,
+    const SchedulerConfig& scheduler_config,
+    const std::string& device,
+    const ov::AnyMap& plugin_config
+) : m_impl{std::make_shared<Impl>(model_path, tokenizer, scheduler_config, device, plugin_config)} {}
+
-std::shared_ptr<ov::genai::Tokenizer> ContinuousBatchingPipeline::get_tokenizer() {
+ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() {
     return m_impl->get_tokenizer();
 }
 
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index 8e475329f1..d7b2aab29c 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -596,10 +596,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs);
 
     py::class_<ContinuousBatchingPipeline>(m, "ContinuousBatchingPipeline")
-        .def(py::init([](const std::string& model_path, const SchedulerConfig& config) {
+        .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
             ScopedVar env_manager(ov_tokenizers_module_path());
-            return std::make_unique<ContinuousBatchingPipeline>(model_path, config);
-        }))
+            return std::make_unique<ContinuousBatchingPipeline>(model_path, scheduler_config, device, properties_to_any_map(plugin_config));
+        }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({}))
+        .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+            ScopedVar env_manager(ov_tokenizers_module_path());
+            return std::make_unique<ContinuousBatchingPipeline>(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config));
+        }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({}))
         .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer)
         .def("get_config", &ContinuousBatchingPipeline::get_config)
         .def("add_request", &ContinuousBatchingPipeline::add_request)
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 9b53a6b78b..2ec96f671c 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -273,7 +273,7 @@ def run_continuous_batching(
     prompts: List[str],
     generation_configs : List[GenerationConfig]
 ) -> List[GenerationResult]:
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
+    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {})
     output = pipe.generate(prompts, generation_configs)
     del pipe
     shutil.rmtree(model_path)
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index f4f35deace..c02804527b 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -7,7 +7,7 @@ import sys
 from dataclasses import dataclass
 from pathlib import Path
-from openvino_genai import ContinuousBatchingPipeline, GenerationConfig
+from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer
 from typing import List
 
 from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \
@@ -205,7 +205,7 @@ def test_post_oom_health(tmp_path):
     model_path : Path = tmp_path / model_id
     save_ov_model_from_optimum(model, hf_tokenizer, model_path)
 
-    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config)
+    pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config)
     # First run should return incomplete response
     output = pipe.generate(["What is OpenVINO?"], generation_configs)
     assert(len(output))

From f0c26772d613cc1a31c7c1491484aef41a706996 Mon Sep 17 00:00:00 2001
From: Anastasiia Pnevskaia
Date: Mon, 15 Jul 2024 19:20:26 +0200
Subject: [PATCH 05/18] Clear beam search info when generate() is finished.
 (#630)

Port of PR: https://github.com/openvinotoolkit/openvino.genai/pull/615
---
 src/cpp/src/continuous_batching_pipeline.cpp | 1 +
 src/cpp/src/sampler.hpp                      | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index 27c183ddd8..ddfebc5926 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -61,6 +61,7 @@ class ContinuousBatchingPipeline::Impl {
                 for (const auto& sequence: request->get_sequences()) {
                     m_scheduler->free_sequence(sequence->get_id());
                 }
+                m_sampler->clear_beam_search_info(request->get_request_id());
                 requests_iterator = m_requests.erase(requests_iterator);
             } else {
                 requests_iterator++;
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 095c795a42..dc631c68ac 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -247,6 +247,8 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits);
 
     void set_seed(size_t seed) { rng_engine.seed(seed); }
+
+    void clear_beam_search_info(uint64_t request_id);
 };
 
 SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits) {
@@ -578,4 +580,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp
         }
     }
 }
+
+void Sampler::clear_beam_search_info(uint64_t request_id) {
+    m_beam_search_info.erase(request_id);
+}
 }

From 73badf67a1a533afa1d94f6fca57a7604a0f4dc9 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Tue, 16 Jul 2024 09:51:55 +0200
Subject: [PATCH 06/18] Update nncf_utils.py (#616) (#633)

Updated default configurations based on results from CVS-143530.
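Each entry in `INT4_MODEL_CONFIGURATION` is essentially a set of keyword arguments
for NNCF weight compression. As a hedged sketch of how such a config maps onto
`nncf.compress_weights` (the model path is a placeholder, and bench-specific keys
such as "scale" and "dataset" are handled by the llm_bench tooling rather than
passed through directly):

```python
import nncf
import openvino as ov

core = ov.Core()
model = core.read_model("dolly-v2-3b/openvino_model.xml")  # placeholder path

# Mirrors the updated "dolly-v2-3b" entry below (mode/group_size/ratio only).
compressed = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=128,
    ratio=1.0,
)
ov.save_model(compressed, "dolly-v2-3b/openvino_model_int4.xml", compress_to_fp16=False)
```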
(cherry picked from commit f460002dcc24171f279e032b4f91df3feab00c35) --- llm_bench/python/utils/nncf_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index 51d2c67979..25ef8aff18 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -38,10 +38,9 @@ def get_compressed_path(output_dir: str, base_precision, option: str): INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, @@ -58,7 +57,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72}, "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, - "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, + "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True}, "falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, "orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}}, @@ -70,7 +69,13 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9}, "llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, "opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7}, - "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True}, "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "gpt-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.5, "scale": True}, + "longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "stablelm-7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.6, "scale": True}, + "phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, } From 
25655e3f63705424a6de2180b3d49d8653a62f2e Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Tue, 16 Jul 2024 13:25:27 +0400 Subject: [PATCH 07/18] Workaround cmake packaging (#634) Remove unwanted archives --- CMakeLists.txt | 19 +++++++++++++++++++ src/python/CMakeLists.txt | 11 ----------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be8e03548a..908e353484 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,20 @@ find_file(spda_to_pa_header sdpa_to_paged_attention.hpp include(cmake/features.cmake) +if(ENABLE_PYTHON) + # the following two calls are required for cross-compilation + if(OpenVINODeveloperPackage_DIR) + ov_find_python3(REQUIRED) + ov_detect_python_module_extension() + else() + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + else() + find_package(Python3 REQUIRED COMPONENTS Interpreter Development) + endif() + endif() +endif() + add_subdirectory(thirdparty) add_subdirectory(src) add_subdirectory(samples) @@ -52,4 +66,9 @@ install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LIC install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) +# Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses) +if(ENABLE_PYTHON) + list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) +endif() include(CPack) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 3d03a0d7a8..7427b624a5 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -12,17 +12,6 @@ FetchContent_GetProperties(pybind11) # search for FindPython3.cmake instead of legacy modules set(PYBIND11_FINDPYTHON ON) -# the following two calls are required for cross-compilation -if(OpenVINODeveloperPackage_DIR) - ov_find_python3(REQUIRED) - ov_detect_python_module_extension() -else() - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) - else() - find_package(Python3 REQUIRED COMPONENTS Interpreter Development) - endif() -endif() if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) From 754f6d772003c4d9ceb17f85d535bfe1f1648803 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Tue, 16 Jul 2024 14:29:34 +0400 Subject: [PATCH 08/18] Save licensing_genai into docs to align with OpenVINO (#637) --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 908e353484..5f7390f981 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,8 +62,8 @@ add_subdirectory(src) add_subdirectory(samples) add_subdirectory(tests/cpp) -install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) -install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) 
 set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
 # Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614
 set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses)

From e5247e048c2c74e4236e0333dc1825adf1fccf7c Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Tue, 16 Jul 2024 18:54:46 +0400
Subject: [PATCH 09/18] Update submodule (#638)

---
 thirdparty/openvino_tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index 880d569cd2..04795c1b78 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit 880d569cd2f5d52165b940542e2f9190172ed2cb
+Subproject commit 04795c1b78c61e3294d1744c78a8ebb5e129256c

From 2d1fa3b33fc3308f4cce9917829ad24346cc0901 Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Wed, 17 Jul 2024 15:51:54 +0400
Subject: [PATCH 10/18] Add Llama3 (#620)

Co-authored-by: Yaroslav Tarkan
---
 samples/cpp/beam_search_causal_lm/README.md     |  2 +-
 samples/cpp/chat_sample/README.md               |  2 +-
 samples/cpp/greedy_causal_lm/README.md          |  2 +-
 samples/cpp/multinomial_causal_lm/README.md     |  2 +-
 samples/cpp/prompt_lookup_decoding_lm/README.md |  2 +-
 samples/cpp/speculative_decoding_lm/README.md   |  2 +-
 samples/python/beam_search_causal_lm/README.md  |  2 +-
 samples/python/chat_sample/README.md            |  2 +-
 samples/python/greedy_causal_lm/README.md       |  2 +-
 samples/python/multinomial_causal_lm/README.md  |  2 +-
 src/docs/SUPPORTED_MODELS.md                    | 14 +++++++++++++-
 11 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md
index a104288911..82232c42f6 100644
--- a/samples/cpp/beam_search_causal_lm/README.md
+++ b/samples/cpp/beam_search_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ sample that supports most popular models like LLaMA 2
+# Text generation C++ sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md
index 4baa8385ef..8a24b20005 100644
--- a/samples/cpp/chat_sample/README.md
+++ b/samples/cpp/chat_sample/README.md
@@ -1,4 +1,4 @@
-# C++ chat_sample that supports most popular models like LLaMA 2
+# C++ chat_sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md
index 3c0758ee6b..c0a7d5f3c4 100644
--- a/samples/cpp/greedy_causal_lm/README.md
+++ b/samples/cpp/greedy_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2
+# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md
index 731d03e3c1..4478579919 100644
--- a/samples/cpp/multinomial_causal_lm/README.md
+++ b/samples/cpp/multinomial_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2
+# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md
index 980c0cd19c..89a5e2c585 100644
--- a/samples/cpp/prompt_lookup_decoding_lm/README.md
+++ b/samples/cpp/prompt_lookup_decoding_lm/README.md
@@ -1,4 +1,4 @@
-# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2
+# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3
 
 [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching against the prompt to generate candidate token sequences. This method is highly effective for input-grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality.
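To make the string-matching idea above concrete, here is a toy sketch of
prompt-lookup candidate generation. It is illustrative only and is not the
sample's actual implementation:

```python
def prompt_lookup_candidates(tokens: list[int], ngram_size: int = 3, max_candidates: int = 5) -> list[int]:
    """Propose draft tokens by matching the trailing n-gram against earlier text."""
    if len(tokens) <= ngram_size:
        return []
    tail = tokens[-ngram_size:]
    # Scan right to left so the most recent earlier occurrence wins.
    for start in range(len(tokens) - ngram_size - 1, -1, -1):
        if tokens[start:start + ngram_size] == tail:
            # Tokens that followed the match become draft candidates; the main
            # model then verifies them in a single batched forward pass.
            return tokens[start + ngram_size:start + ngram_size + max_candidates]
    return []
```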
diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md
index 7abcb6782a..c86bd8b617 100644
--- a/samples/cpp/speculative_decoding_lm/README.md
+++ b/samples/cpp/speculative_decoding_lm/README.md
@@ -1,4 +1,4 @@
-# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2
+# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3
 
 Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that allows speeding up token generation when an additional smaller draft model is used alongside the main model.
 
diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md
index ff5286d010..5e80aa69da 100644
--- a/samples/python/beam_search_causal_lm/README.md
+++ b/samples/python/beam_search_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python sample that supports most popular models like LLaMA 2
+# Text generation Python sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `openvino_genai.LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md
index 34d71fab8a..983789d0eb 100644
--- a/samples/python/chat_sample/README.md
+++ b/samples/python/chat_sample/README.md
@@ -1,4 +1,4 @@
-# Python chat_sample that supports most popular models like LLaMA 2
+# Python chat_sample that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md
index 7c87b04aad..97b044eb51 100644
--- a/samples/python/greedy_causal_lm/README.md
+++ b/samples/python/greedy_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 2
+# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU.
The sample features `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md
index d76b933663..d39142f3de 100644
--- a/samples/python/multinomial_causal_lm/README.md
+++ b/samples/python/multinomial_causal_lm/README.md
@@ -1,4 +1,4 @@
-# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 2
+# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3
 
 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python.
 
diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md
index 0e6099db03..3eb2af17b4 100644
--- a/src/docs/SUPPORTED_MODELS.md
+++ b/src/docs/SUPPORTED_MODELS.md
@@ -45,7 +45,19 @@
 
 
-      LlamaForCausalLM
+      LlamaForCausalLM
+      Llama 3
+
+
+
+
+
+      Llama 2
    From 489a87d7c46960a0cb9920ac93333394c91d5306 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 17 Jul 2024 16:54:42 +0400 Subject: [PATCH 11/18] nightly->rc1 (#621) --- .github/workflows/causal_lm_cpp.yml | 66 +++++++++---------- .github/workflows/genai_package.yml | 18 ++--- .github/workflows/genai_python_lib.yml | 12 ++-- .github/workflows/lcm_dreamshaper_cpp.yml | 8 +-- .../workflows/stable_diffusion_1_5_cpp.yml | 4 +- src/README.md | 2 +- src/docs/BUILD.md | 8 +-- 7 files changed, 59 insertions(+), 59 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 18cc89a8f0..85bef624c8 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -13,9 +13,9 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -34,8 +34,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -77,8 +77,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -208,8 +208,8 @@ jobs: - name: Download, convert and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -253,8 +253,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -280,8 +280,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -308,8 +308,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r 
./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -336,8 +336,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -364,8 +364,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -401,8 +401,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -445,8 +445,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -493,8 +493,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -543,8 +543,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -603,8 +603,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release 
-j @@ -645,8 +645,8 @@ jobs: - name: Install dependencies and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -684,8 +684,8 @@ jobs: - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 06e589dfb9..2535e423d9 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: ubuntu_genai_package: strategy: @@ -28,8 +28,8 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && 
cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -57,8 +57,8 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace @@ -99,8 +99,8 @@ jobs: shell: bash - run: call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: call ov\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - run: 
call ov\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: call ov\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: call ov\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 34d5fbf924..e0c43bddd5 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -5,9 +5,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true env: - l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/l_openvino_toolkit_centos7_2024.3.0.dev20240708_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240708_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15945-a349dc82f9a/w_openvino_toolkit_windows_2024.3.0.dev20240708_x86_64.zip + l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_centos7_2024.3.0.dev20240711_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 and CentOS7 env. @@ -29,7 +29,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install .
--verbose - run: python -m pytest ./tests/python_tests/ @@ -52,7 +52,7 @@ jobs: - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . --verbose - run: python -c "from openvino_genai import LLMPipeline" @@ -79,7 +79,7 @@ jobs: shell: bash - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j - - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager # cmd evaluates variables differently. Setting PYTHONPATH before calling setupvars.bat, rather than after, solves that. - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/ - run: call ./ov/setupvars.bat && python -m pip install .
--verbose diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 2d450ad9c8..82a74f8cdf 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -50,8 +50,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -95,8 +95,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index cda567c23b..5197b27da8 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -49,8 +49,8 @@ jobs: working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install -r ../../requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/src/README.md b/src/README.md index c67a60eaec..445b88aa58 100644 --- a/src/README.md +++ b/src/README.md @@ -23,7 +23,7 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git > cd openvino.genai > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release > python -m pip install 
--upgrade-strategy eager -r ./samples/requirements.txt > ``` diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 710428139e..1aee73bfb0 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -18,7 +18,7 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240626_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh ``` 3. Build the project: @@ -48,9 +48,9 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/w_openvino_toolkit_windows_2024.3.0.dev20240626_x86_64.zip + curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip unzip ov.zip - mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240626_x86_64 + mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64 ``` 3. Build the project: ```sh @@ -85,7 +85,7 @@ 2. Download OpenVINO archive and install dependencies: ```sh mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.3.0-15805-6138d624dc1/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240626_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz ``` 3. 
Build the project: ```sh From 67f04675a03b2774babc1e0358407ba67f49cb45 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Wed, 17 Jul 2024 19:17:11 +0400 Subject: [PATCH 12/18] Add OpenVINOGenAITargets to core_genai_dev COMPONENT (#642) OpenVINOGenAITargets.cmake was excluded from packaging because CPACK_COMPONENTS_ALL is custom now and doesn't install the Unspecified component --- CMakeLists.txt | 2 +- src/cpp/CMakeLists.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f7390f981..7059324d84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT lice set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) # Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 -set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses) +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_docs) if(ENABLE_PYTHON) list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) endif() diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 454c53b944..c140bf9ac7 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -103,7 +103,8 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake - NAMESPACE openvino:: DESTINATION runtime/cmake) + NAMESPACE openvino:: DESTINATION runtime/cmake + COMPONENT core_genai_dev) include(CMakePackageConfigHelpers) configure_package_config_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/OpenVINOGenAIConfig.cmake.in" From 19691609512f7c7d344cdf19cd8d36db30b6c574 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Mon, 22 Jul 2024 12:46:25 +0400 Subject: [PATCH 13/18] Apply todo, initialize detokenizer's cache (#647) --- src/cpp/src/tokenizer.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 9b4a206a1e..ac6b925dcb 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -98,8 +98,11 @@ class Tokenizer::TokenizerImpl { device).create_infer_request(); // Get special token ids by inference if they are not defined. - // todo: do not call until CVS-143410 is resolved - // infer_special_tokens_if_necessary(); + infer_special_tokens_if_necessary(); + // Initialize the tokenizer's cache to save time later. + // infer_special_tokens_if_necessary() may already have done that, + // but it is not guaranteed to have run decode().
+ decode(encode("").input_ids); } // load special tokens ids from config.json From 0e0f6a9c6cde08835dd579b20e76149f9fc17545 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Mon, 22 Jul 2024 11:04:19 +0100 Subject: [PATCH 14/18] Cherry-pick static LLM pipeline changes (#654) Co-authored-by: Pavel Esir --- samples/cpp/chat_sample/chat_sample.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 88 +++++++++++++++++++------ src/cpp/src/llm_pipeline_static.hpp | 12 ++-- 3 files changed, 74 insertions(+), 28 deletions(-) diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index d9d9c2b2de..ae4dad88a2 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -10,7 +10,7 @@ int main(int argc, char* argv[]) try { std::string prompt; std::string model_path = argv[1]; - std::string device = "CPU"; // GPU can be used as well + std::string device = "CPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(model_path, "CPU"); ov::genai::GenerationConfig config; diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3a9ea4d1d9..3f50d30ec9 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -8,6 +8,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include <openvino/pass/stateful_to_stateless.hpp> + namespace { std::shared_ptr<ov::Model> add_slices_to_kvcache_inputs(const std::shared_ptr<ov::Model>& model) { @@ -75,25 +77,42 @@ void reshape_to_static(std::shared_ptr<ov::Model> model, model->reshape(new_shapes); } -void fill_tensor(ov::Tensor tensor, int64_t fill_val) { +void fill_tensor(ov::Tensor tensor, int64_t fill_val, size_t offset = 0u) { int64_t* tensor_data = tensor.data<int64_t>(); - std::fill(tensor_data, tensor_data + tensor.get_size(), fill_val); + std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); } -void copy_with_left_offset(const ov::Tensor& orig, ov::Tensor& padded) { - const auto orig_size = orig.get_size(); - const auto padded_size = padded.get_size(); - const auto kLeftOffset = padded_size - orig_size; +void copy_with_offset(const ov::Tensor& orig, const int32_t offset, ov::Tensor& padded) { int64_t* orig_data = orig.data<int64_t>(); int64_t* padded_data = padded.data<int64_t>(); - std::copy(orig_data, orig_data + orig_size, padded_data + kLeftOffset); + std::copy(orig_data, orig_data + orig.get_size(), padded_data + offset); } -ov::AnyMap extract_config_or_empty(const ov::AnyMap& config, const std::string& config_name) { +ov::AnyMap extract_config_or_default(const ov::AnyMap& config, const std::string& config_name) { ov::AnyMap stage_cfg; if (auto it = config.find(config_name); it != config.end()) { const auto& map = it->second.as<std::map<std::string, ov::Any>>(); stage_cfg = { map.begin(), map.end() }; + } else if (config_name == "PREFILL_CONFIG") { + std::map<std::string, std::string> prefill_config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } }; + stage_cfg.insert(prefill_config.begin(), prefill_config.end()); + } else if (config_name == "GENERATE_CONFIG") { + std::map<std::string, std::string> generate_config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, + { "NPUW_PARALLEL_COMPILE", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" } }; + stage_cfg.insert(generate_config.begin(), generate_config.end()); } return stage_cfg; } @@ -126,7 +145,8 @@
StaticLLMPipeline::StaticLLMPipeline( ov::Core core; // (1) Read the template model - this will be kvcache model auto kvcache_model = core.read_model(path / "openvino_model.xml"); - // (2) TODO: Expose KV-cache input and output layers from kvcache model + // (2) Expose KV-cache input and output layers from kvcache model + ov::pass::StatefulToStateless().run_on_model(kvcache_model); // (3) Clone the model - this will be prefill auto prefill_model = kvcache_model->clone(); prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); @@ -140,10 +160,10 @@ StaticLLMPipeline::StaticLLMPipeline( kvcache_model = add_slices_to_kvcache_inputs(kvcache_model); // (6) Compile both models m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_empty(config, "PREFILL_CONFIG") + prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_empty(config, "GENERATE_CONFIG") + kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation(); @@ -156,6 +176,18 @@ StaticLLMPipeline::StaticLLMPipeline( ) : StaticLLMPipeline(path, path.string(), device, config) { } +void StaticLLMPipeline::start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; +}; + +void StaticLLMPipeline::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; + void StaticLLMPipeline::prepare_for_new_conversation() { fill_tensor(m_prefill_request.get_tensor("input_ids"), m_tokenizer.get_pad_token_id()); fill_tensor(m_prefill_request.get_tensor("position_ids"), 0u); @@ -175,9 +207,23 @@ DecodedResults StaticLLMPipeline::generate( } OPENVINO_ASSERT(std::holds_alternative<std::string>(inputs)); - auto tokenized_input = m_tokenizer.encode(std::get<std::string>(inputs)); + auto& prompt = std::get<std::string>(inputs); + + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + auto tokenized_input = m_tokenizer.encode(prompt); auto encoded_results = generate(tokenized_input, config, streamer); - return {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + + if (m_is_chat_conversation) { + auto answer = decoded_results.texts[0]; + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + return decoded_results; } EncodedResults StaticLLMPipeline::generate( @@ -222,22 +268,25 @@ EncodedResults StaticLLMPipeline::generate( ov::genai::EncodedResults results; // NB: Only batch=1 is supported now results.scores.resize(1u); + results.scores[0] = 0u; results.tokens.resize(1u); - // NB: Check if input prompt less than maximum size + // NB: Check if there is enough space in KV-cache to process input prompt auto prompt_len = input_ids.get_size(); if (prompt_len > m_kvcache_desc.total_size) { OPENVINO_THROW("Currently static pipeline only processes up to " + std::to_string(m_kvcache_desc.total_size) + " tokens"); } - // NB: Reset tensors on every generate call - chat conversation isn't supported yet!
+ // NB: From the "generate" perspective, every call is treated as the start of a new conversation, + // but if a continuation is needed, the prompt contains information about the entire conversation. prepare_for_new_conversation(); auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); - copy_with_left_offset(input_ids, padded_input_ids); + const size_t offset = padded_input_ids.get_size() - input_ids.get_size(); + copy_with_offset(input_ids, offset, padded_input_ids); auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); - copy_with_left_offset(attention_mask, padded_attention_mask); + fill_tensor(padded_attention_mask, 1u, offset); auto padded_position_ids = m_prefill_request.get_tensor("position_ids"); auto* padded_pos_data = padded_position_ids.data<int64_t>(); @@ -248,13 +297,13 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += prompt_len; int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; } padded_attention_mask.copy_to(m_kvcache_request.get_tensor("attention_mask")); - // Inputs: input_ids, attention_mask, position_ids, ... // Outputs: logits, ... const auto kStartInputKVCacheLayers = 3u; @@ -286,13 +335,12 @@ EncodedResults StaticLLMPipeline::generate( last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); results.tokens[0].push_back(last_token); - results.scores[0] = 0u; if (streamer_ptr && streamer_ptr->put(last_token)) { break; } - if (last_token == m_generation_config.eos_token_id) { + if (last_token == config.eos_token_id && !config.ignore_eos) { break; } diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 8c2f19ffa7..85488e1880 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -35,13 +35,8 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { StreamerVariant streamer ) override; - void start_chat(const std::string& system_message) override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; - void finish_chat() override { - OPENVINO_THROW("Currently chat conversation mode isn't supported"); - }; - + void start_chat(const std::string& system_message) override; + void finish_chat() override; private: void prepare_for_new_conversation(); @@ -54,6 +49,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; + + bool m_is_chat_conversation = false; + ChatHistory m_history; }; } // namespace genai From 7bf42f1f12f55f1ae30610267897c22a98545f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Mon, 22 Jul 2024 17:03:49 +0200 Subject: [PATCH 15/18] Cherry-pick custom max_element loop (#662) Cherry picked from master --- src/cpp/src/sampler.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index dc631c68ac..6390fc8725 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,8 +219,13 @@ class Sampler { } Token _greedy_sample(const std::vector<Token>& logit_vector) const { - auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); - return *out_token; + Token
max_token{-std::numeric_limits<float>::infinity(), 0}; + for (const auto& logit : logit_vector) { + if (logit.m_log_prob > max_token.m_log_prob) { + max_token = logit; + } + } + return max_token; } std::vector<Token> _multinomial_sample(const std::vector<Token>& logit_vector, size_t num_tokens_per_sequence) { From bad01b94e2c21abce6d211c8c85db00f9af7f6c0 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 22 Jul 2024 19:35:25 +0200 Subject: [PATCH 16/18] Add note for pybind ov::Tensor issue (#659) --- src/docs/BUILD.md | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 1aee73bfb0..3b89995dc2 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -1,5 +1,8 @@ # How to Build OpenVINO™ GenAI +> **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. +The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment, or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). + ## Build for Linux Systems ### Software Requirements @@ -10,20 +13,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the OpenVINO install directory is referred to as `<INSTALL_DIR>` throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - sudo ./ov/install_dependencies/install_openvino_dependencies.sh - ``` 3. Build the project: ```sh - source ./ov/setupvars.sh + source <INSTALL_DIR>/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov ``` @@ -40,21 +39,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the OpenVINO install directory is referred to as `<INSTALL_DIR>` throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/windows/w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64.zip - unzip ov.zip - mklink /D ov w_openvino_toolkit_windows_2024.3.0.dev20240711_x86_64 - ``` 3.
Build the project: ```sh - call ov\setupvars.bat + call <INSTALL_DIR>\setupvars.bat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov ``` @@ -77,19 +71,16 @@ ### Build Instructions -1. Clone OpenVINO GenAI repository and init submodules: +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the OpenVINO install directory is referred to as `<INSTALL_DIR>` throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.3.0rc1/macos/m_openvino_toolkit_macos_12_6_2024.3.0.dev20240711_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - ``` 3. Build the project: ```sh - source ./ov/setupvars.sh + source <INSTALL_DIR>/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target package -j cmake --install ./build/ --config Release --prefix ov From cb0da0ad7a2e35f686d7f529489d83ce01783989 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 23 Jul 2024 01:57:33 +0800 Subject: [PATCH 17/18] [OV 24.3]Fix multinomial sample CMakeList (#658) @Wovchena, retarget to OV 24.3 release branch --- samples/cpp/multinomial_causal_lm/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt index efcac50f09..98bc76ee3c 100644 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -11,7 +11,7 @@ set_target_properties(multinomial_causal_lm PROPERTIES COMPILE_PDB_NAME multinomial_causal_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) install(TARGETS multinomial_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin From bc9224884963ff89c99b7c73b30404fd6e3b0f40 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 08:31:59 +0200 Subject: [PATCH 18/18] add Readme for tests (#664) - Added Readme for python tests - Added `--model_ids` option to run selectively only on specific models --------- Co-authored-by: Zlobin Vladimir --- tests/python_tests/README.md | 47 ++++++++++++++++++++ tests/python_tests/conftest.py | 7 ++- tests/python_tests/ov_genai_test_utils.py | 5 ++- tests/python_tests/test_chat_generate_api.py | 4 ++ tests/python_tests/test_generate_api.py | 32 +++++++++++++ 5 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 tests/python_tests/README.md diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md new file mode 100644 index 0000000000..e5381708de --- /dev/null +++ b/tests/python_tests/README.md @@ -0,0 +1,47 @@ +# OpenVINO™ GenAI Tests + +These tests aim to validate support for the vanilla and continuous batching GenAI APIs. + +## Setup environment + +To run the tests, first build or install the OpenVINO GenAI library following the instructions in the [GenAI Library README](../../src/README.md).
+ + Then install the test requirements: +```sh +pip install -r tests/python_tests/requirements.txt +``` + +## Run Tests + +```sh +python -m pytest tests/python_tests/ -m precommit +``` + +During the tests, downloaded HuggingFace (HF) models are saved in the current directory. If you wish to place them somewhere else, you can set the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g. +```sh +GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit +``` + +If you built the GenAI library yourself instead of installing the wheel, set `PYTHONPATH` so that the tests can find the library, e.g. +```sh +PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit +``` + +## Customise test runs + +Tests have `precommit` and `nightly` sets of models. `precommit` contains lightweight models that can be inferred quickly; `nightly` models are heavier and require more time for inference. If you wish to run specific tests only for nightly models, you can use the `-k` option, for example, to run only the multibatch and chat tests: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat" +``` + +If you wish to run all tests except beam search, do the following: +```sh +python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search" +``` + +The `--model_ids` argument can be used to run tests selectively only for specific models. HF model ids should be separated by spaces, e.g.: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct" +``` + +The list of currently supported `nightly` and `precommit` models can be found in tests/python_tests/ov_genai_test_utils.py:get_models_list diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index 66212468af..f98f47ecf3 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname): return f'{argname}={val}' return None -def pytest_configure(config): +def pytest_addoption(parser): + parser.addoption("--model_ids", help="Select models to run") + +def pytest_configure(config: pytest.Config): marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' pytest.run_marker = marker + pytest.selected_model_ids = config.getoption('--model_ids', default=None) + diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 4ba71a1d48..bc95418aff 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -49,7 +49,10 @@ def get_models_list(): model_ids = precommit_models else: model_ids = nightly_models - +
generation_config: Dict): device = 'CPU' chat_history_hf = [] @@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict): @pytest.mark.parametrize("generation_config", configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): # compares with HF when history in ov_genai is save as a text device = 'CPU' @@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict) @pytest.mark.parametrize("generation_config", configs) @pytest.mark.parametrize("model_descr", get_chat_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): # Check that when history is stored in KV cache results are the same as when history stored in a text. device ='CPU' @@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: {'role': 'user', 'content': 'What was my first question?'}, ] @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize('chat_config', get_chat_templates()) def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): tokenizer_config = chat_config[1] diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 40eba92277..e2395cf8d7 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison( @pytest.mark.parametrize("generation_config,prompt", test_cases) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_decoding(model_descr, generation_config, prompt): run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt): condition=sys.platform == "linux" ) @pytest.mark.precommit +@pytest.mark.nightly def test_ov_tensors(model_descr, inputs): hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) @@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=TypeError, reason="pybind was unable to find ov::Tensor from openvino yet", @@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt): @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.parametrize("encoded_prompt", encoded_prompts) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=TypeError, reason="pybind was unable to find ov::Tensor from openvino yet", @@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt): @pytest.mark.parametrize("prompts", batched_prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_multibatch(model_descr, generation_config, prompts): run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) @@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts): @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_beam_search_decoding(model_descr, num_beam_groups, group_size, 
max_new_tokens, diversity_penalty, prompt): generation_config = dict( @@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, @pytest.mark.parametrize("max_new_tokens", [10, 80]) @pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence # while genai ends sentence with @@ -323,6 +330,7 @@ def user_defined_callback(subword): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() @@ -332,6 +340,7 @@ def test_callback_one_string(callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -340,12 +349,14 @@ def test_callback_batch_fail(callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_one_string(callback): pipe = read_model(get_models_list()[0])[4] pipe.generate('table is made of', max_new_tokens=10, streamer=callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("model_descr", get_models_list()) def test_callback_decoding_metallama(model_descr, callback): # On metallam this prompt generates output which can shorten after adding new tokens. 
@@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): @@ -380,6 +392,7 @@ def end(self): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_one_string(): pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() @@ -389,6 +402,7 @@ def test_streamer_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -397,6 +411,7 @@ def test_streamer_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -404,6 +419,7 @@ def test_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_one_string(callback): pipe = read_model(get_models_list()[0])[4] @@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_batch_fail(callback): pipe = read_model(get_models_list()[0])[4] @@ -429,6 +447,7 @@ def test_operator_with_callback_batch_fail(callback): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_one_string(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -436,6 +455,7 @@ def test_operator_with_streamer_kwargs_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_batch_fail(): pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) @@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail(): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_ids_1(model_tmp_path): # test when there is an available config.json config_json = { @@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_str_2(model_tmp_path): # test with special_tokens_map special_tokens_map_json = { @@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3_(model_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists @@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3(model_tmp_path): # both config.json is availabel and tokenizer_config.json available # check that it does not read int values from tokenizer_config.json if they are in config.json @@ -532,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=AssertionError, reason="CVS-143410 ov tokenizer 
should be aligned with hf", @@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path): ] @pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit +@pytest.mark.nightly def test_invalid_configs(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path config_json = {} @@ -584,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config): @pytest.mark.precommit +@pytest.mark.nightly def test_valid_configs(model_tmp_path): model_id, temp_path = model_tmp_path pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) @@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path): dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k ] @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("generation_config", invalid_py_configs) def test_python_generation_config_validation(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path @@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_1(): # On this model this prompt generates an unfinished utf-8 string. # Test that pybind will not fail. @@ -626,6 +655,7 @@ def test_unicode_pybind_decoding_1(): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_2(): # On this model this prompt generates an unfinished utf-8 string. # Test that pybind will not fail. @@ -636,6 +666,7 @@ def test_unicode_pybind_decoding_2(): @pytest.mark.precommit +@pytest.mark.nightly def test_unicode_pybind_decoding_3(): # On this model this prompt generates unfinished utf-8 string # and streams it. Test that pybind will not fail while we pass string to python. @@ -648,6 +679,7 @@ def test_unicode_pybind_decoding_3(): @pytest.mark.skip(reason="probably both models ov + hf don't fit in memory") @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") def test_left_pad(): # test left pad tokenizer post processing implementation