From b80053182f94885fa092013fcdef7c0f2deff84a Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 23 Dec 2024 12:23:56 +0000 Subject: [PATCH 01/20] Experimental snapshot with greedy decoding --- src/cpp/src/llm_pipeline_static.cpp | 32 ++++++++++++-- src/cpp/src/sampler.cpp | 68 +++++++++++++++-------------- src/cpp/src/sampler.hpp | 1 + 3 files changed, 65 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 090aed9650..4505a8832e 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -3,6 +3,9 @@ #include "llm_pipeline_static.hpp" +#include "logit_processor.hpp" +#include "sampler.hpp" + #include #include @@ -937,6 +940,21 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } +int64_t sample_next_token(const ov::Tensor& logits, + const GenerationConfig& config, + LogitProcessor& logit_processor) { + Logits logit_vector(logits.data(), logits.get_shape()[2]); + logit_processor.apply(logit_vector); + int64_t last_token = -1; + if (config.is_greedy_decoding()) { + last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; + } else { + OPENVINO_ASSERT(false); + } + logit_processor.register_new_generated_token(last_token); + return last_token; +} + EncodedResults StaticLLMPipeline::generate( const EncodedInputs& inputs, OptionalGenerationConfig generation_config, @@ -973,10 +991,15 @@ EncodedResults StaticLLMPipeline::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } - if (!config.is_greedy_decoding()) { - OPENVINO_THROW("Currently only greedy decoding is supported"); + if (!config.is_greedy_decoding() && !config.is_multinomial()) { + OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + std::vector input_ids_vec; + input_ids_vec.reserve(input_ids.get_size()); + std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); + LogitProcessor logit_processor(config, input_ids_vec); + ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; @@ -1015,7 +1038,8 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + + auto last_token = sample_next_token(m_prefill_request.get_tensor("logits"), config, logit_processor); results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; @@ -1069,7 +1093,7 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); + last_token = sample_next_token(m_kvcache_request.get_tensor("logits"), config, logit_processor); results.tokens[0].push_back(last_token); raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f77463d767..48c3e5354e 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -67,6 +67,41 @@ std::vector log_softmax(const ov::Tensor& logits, size_t batch_idx) { return tokens; } +Token greedy_sample(const Logits& logits, size_t top_logprobs) { + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + size_t m = 
std::max(size_t(1), top_logprobs); // ensure m is at least 1 + std::vector top_values(m, -std::numeric_limits::infinity()); + std::vector top_indexes(m, 0); + + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > top_values.back()) { + top_values.back() = logits.m_data[i]; + top_indexes.back() = i; + + for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { + std::swap(top_values[j], top_values[j - 1]); + std::swap(top_indexes[j], top_indexes[j - 1]); + } + } + } + + size_t max_index = top_indexes.front(); + float max_value = 0.0; + + if (top_logprobs) { + // apply log softmax to max value + max_value = top_values.front(); + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + } + + return Token(max_value, max_index); +} + std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -493,38 +528,7 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to } Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { - // For greedy sampling we do not expect sorting or shrinking considered tokens - // so we can operate directly on the data buffer - size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 - std::vector top_values(m, -std::numeric_limits::infinity()); - std::vector top_indexes(m, 0); - - for (size_t i = 0; i < logits.m_size; ++i) { - if (logits.m_data[i] > top_values.back()) { - top_values.back() = logits.m_data[i]; - top_indexes.back() = i; - - for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { - std::swap(top_values[j], top_values[j - 1]); - std::swap(top_indexes[j], top_indexes[j - 1]); - } - } - } - - size_t max_index = top_indexes.front(); - float max_value = 0.0; - - if (top_logprobs) { - // apply log softmax to max value - max_value = top_values.front(); - float log_sum = std::log(std::accumulate( - logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_value); - })); - max_value = -log_sum; - } - - return Token(max_value, max_index); + return greedy_sample(logits, top_logprobs); } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 0f7876cbf9..0cf983fc01 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -31,6 +31,7 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); +Token greedy_sample(const Logits& logits, size_t top_logprobs); struct SamplerOutput { // IDs of sequences that need to be dropped From 55ead2d6066e4fd8dc6b2e20b7e22e13a551148d Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 23 Dec 2024 16:17:56 +0000 Subject: [PATCH 02/20] Add multinomial support --- src/cpp/src/llm_pipeline_static.cpp | 17 +++++++-- src/cpp/src/llm_pipeline_static.hpp | 4 ++ src/cpp/src/sampler.cpp | 57 ++++++++++++++++------------- src/cpp/src/sampler.hpp | 5 +++ 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp 
b/src/cpp/src/llm_pipeline_static.cpp index 4505a8832e..282c5e957a 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -942,13 +942,22 @@ DecodedResults StaticLLMPipeline::generate( int64_t sample_next_token(const ov::Tensor& logits, const GenerationConfig& config, + std::mt19937& rng_engine, LogitProcessor& logit_processor) { - Logits logit_vector(logits.data(), logits.get_shape()[2]); + const size_t vocab_size = logits.get_shape()[2]; + const size_t seq_len_size = logits.get_shape()[1]; + const size_t offset = (seq_len_size - 1) * vocab_size; + // NB: Slice out and take probabilities only for the last token + Logits logit_vector(logits.data() + offset, vocab_size); logit_processor.apply(logit_vector); int64_t last_token = -1; if (config.is_greedy_decoding()) { last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; + } else if (config.is_multinomial()) { + last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index; } else { + // NB: Only greedy and multinomial supported, + // the appropriate check is performed before OPENVINO_ASSERT(false); } logit_processor.register_new_generated_token(last_token); @@ -1039,7 +1048,8 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - auto last_token = sample_next_token(m_prefill_request.get_tensor("logits"), config, logit_processor); + auto last_token = sample_next_token( + m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor); results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; @@ -1093,7 +1103,8 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = sample_next_token(m_kvcache_request.get_tensor("logits"), config, logit_processor); + last_token = sample_next_token( + m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor); results.tokens[0].push_back(last_token); raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 7acc28c684..664b820346 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include "llm_pipeline_base.hpp" @@ -83,6 +84,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool m_is_chat_conversation = false; ChatHistory m_history; + + // NB: For multinomial sampling + std::mt19937 m_rng_engine; }; } // namespace genai diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 48c3e5354e..468bdc2b48 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -102,6 +102,36 @@ Token greedy_sample(const Logits& logits, size_t top_logprobs) { return Token(max_value, max_index); } +std::vector multinomial_sample(const Logits& logits, + size_t num_tokens_per_sequence, + std::mt19937& rng_engine) { + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. 
+ std::vector multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilities + // which result returning NAN only logprobs. + // so log() is applied after this line + auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; +} + std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -532,31 +562,7 @@ Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { - // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. - std::vector multinomial_weights; - multinomial_weights.reserve(logits.m_size); - if (logits.is_vector_initialized()) - for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); - else - multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); - - // std::discrete_distribution returns corrupted results when applied to log probabilities - // which result returning NAN only logprobs. 
- // so log() is applied after this line - auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 - - std::vector out_tokens; - for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { - size_t element_to_pick = dist(rng_engine); - if (logits.is_vector_initialized()) { - auto logit = logits.m_vector[element_to_pick]; - logit.m_log_prob = std::log(logit.m_log_prob); - out_tokens.push_back(logit); - } - else - out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); - } - return out_tokens; + return multinomial_sample(logits, num_tokens_per_sequence, rng_engine); } std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { diff --git a/src/cpp/src/sampler.hpp index 0cf983fc01..d79173087a 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -31,8 +31,13 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); + Token greedy_sample(const Logits& logits, size_t top_logprobs); +std::vector multinomial_sample(const Logits& logits, + size_t num_tokens_per_sequence, + std::mt19937& rng_engine); + struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; From 188575d92d52e5a489c1e0b73175b771c3b52415 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Tue, 24 Dec 2024 10:46:20 +0000 Subject: [PATCH 03/20] Handle rng seed --- src/cpp/src/llm_pipeline_static.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ee81ea377b..34d118c387 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1009,6 +1009,7 @@ EncodedResults StaticLLMPipeline::generate( input_ids_vec.reserve(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); LogitProcessor logit_processor(config, input_ids_vec); + m_rng_engine.seed(config.rng_seed); ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; From ab849500cc6a655ed1fff1c96ca05fd69e2964e8 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Tue, 24 Dec 2024 11:14:16 +0000 Subject: [PATCH 04/20] Update sampler.cpp --- src/cpp/src/sampler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/sampler.cpp index d50caeffda..0e53f1b4ac 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -771,6 +771,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); const ov::genai::GenerationConfig& sampling_params =
sequence_group->get_sampling_parameters(); + const auto request_id = sequence_group->get_request_id(); if (!m_logit_processors.count(request_id)) { m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())}); From c73f1f50bf9a9eeaa191f7774109a11e5637d772 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 30 Dec 2024 15:06:41 +0000 Subject: [PATCH 05/20] Handle stop_strings and add more tests --- src/cpp/src/llm_pipeline_static.cpp | 34 ++++++-- src/cpp/src/sampler.cpp | 7 -- src/cpp/src/sampler.hpp | 15 ++++ .../python_tests/test_llm_pipeline_static.py | 84 ++++++++++++++++--- 4 files changed, 118 insertions(+), 22 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 34d118c387..db4133b0e7 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -238,12 +238,12 @@ enum class GenerateHint { std::string to_string(GenerateHint h) { switch(h) { - case GenerateHint::FAST_COMPILE : + case GenerateHint::FAST_COMPILE : return "FAST_COMPILE"; - case GenerateHint::BEST_PERF : + case GenerateHint::BEST_PERF : return "BEST_PERF"; default: - OPENVINO_THROW("Unsupported value for type GenerateHint provided"); + OPENVINO_THROW("Unsupported value for type GenerateHint provided"); } } @@ -692,7 +692,6 @@ StaticLLMPipeline::StaticLLMPipeline( const ov::AnyMap& properties, const ov::genai::GenerationConfig& generation_config ) : LLMPipelineImplBase(tokenizer, generation_config) { - bool use_blobs = false; auto anyopt = get_option(properties, "USE_BLOBS"); if (anyopt.has_value()) { @@ -1005,12 +1004,23 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + // FIXME:... 
+ if ( streamer_ptr && + !config.stop_strings.empty() && + !config.include_stop_str_in_output) { + OPENVINO_THROW("Static LLM pipeline doesn't support " + "\"include_stop_str_in_output: false\" when a streamer is provided"); + } + std::vector input_ids_vec; input_ids_vec.reserve(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); LogitProcessor logit_processor(config, input_ids_vec); m_rng_engine.seed(config.rng_seed); + const auto processed_stop_strings = + process_stop_strings(config.stop_strings, m_tokenizer); + ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; @@ -1111,11 +1121,25 @@ EncodedResults StaticLLMPipeline::generate( raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + bool met_stop_str = false; + if (!config.stop_strings.empty()) { + auto match_result = match_stop_string(m_tokenizer, + results.tokens[0], + processed_stop_strings, + config.include_stop_str_in_output); + if (match_result.is_matched) { + met_stop_str = true; + results.tokens[0].erase( + results.tokens[0].end() - match_result.to_remove, results.tokens[0].end()); + } + } + if (streamer_ptr && streamer_ptr->put(last_token)) { break; } - if (last_token == config.eos_token_id && !config.ignore_eos) { + if (met_stop_str || (last_token == config.eos_token_id && !config.ignore_eos)) { break; } diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index d50caeffda..646eb6b85e 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -160,13 +160,6 @@ std::vector encode_and_process_string(const std::string& stop_string, o return encoded_stop_string; } -struct MatchStopStringResult { - size_t to_remove = 0; - // int64_t last_token_id = 0; - // bool is_to_update_last_token = false; - bool is_matched = false; -}; - // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. 
MatchStopStringResult match_stop_string(Tokenizer& tokenizer, const TokenIds& generated_tokens, diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 12870ec70e..baf57ebb40 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -38,6 +38,21 @@ std::vector multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence, std::mt19937& rng_engine); +std::pair> +process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer); + +struct MatchStopStringResult { + size_t to_remove = 0; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; + bool is_matched = false; +}; + +MatchStopStringResult match_stop_string(Tokenizer& tokenizer, + const TokenIds& generated_tokens, + const std::pair>& stop_strings, + bool is_include_to_output); + struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index cad8b0fea0..06dc6e8675 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino.runtime import Core +from openvino_genai import GenerationConfig import pytest import sys from ov_genai_test_utils import ( @@ -10,6 +10,28 @@ get_chat_models_list, ) +from common import \ + get_greedy, \ + get_greedy_with_min_and_max_tokens, \ + get_greedy_with_repetition_penalty, \ + get_greedy_with_penalties, \ + get_greedy_with_min_and_max_tokens, \ + get_greedy_with_single_stop_string, \ + get_greedy_with_multiple_stop_strings, \ + get_greedy_with_multiple_stop_strings_no_match, \ + get_greedy_stop_strings_exclude_from_output, \ + get_greedy_stop_strings_include_to_output, \ + get_greedy_n_stop_strings_exclude_from_output, \ + get_greedy_n_stop_strings_include_to_output, \ + get_multinomial_temperature, \ + get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_and_top_k, \ + get_multinomial_temperature_top_p_and_top_k, \ + get_multinomial_temperature_and_repetition_penalty, \ + get_multinomial_temperature_and_frequence_penalty, \ + get_multinomial_temperature_and_presence_penalty, \ + get_multinomial_all_parameters, \ + get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. common_config = { @@ -29,18 +51,33 @@ def generate_chat_history(model_path, device, pipeline_config, questions): return chat_history +generation_configs = [ + get_greedy(), + get_greedy_with_min_and_max_tokens(), + get_greedy_with_repetition_penalty(), + get_greedy_with_penalties(), + get_greedy_with_min_and_max_tokens(), + get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), + get_greedy_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), + get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), + get_greedy_n_stop_strings_include_to_output() +] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_generation_compare_with_stateful(): - prompt = 'The Sun is yellow because' +@pytest.mark.parametrize("generation_config", generation_configs) +def test_generation_compare_with_stateful(generation_config): + prompt = 'What is OpenVINO?' 
model_path = get_models_list()[0][1] stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU") - ref_out = stateful_pipe.generate(prompt, max_new_tokens=100) + ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) - actual_out = static_pipe.generate(prompt, max_new_tokens=100) + actual_out = static_pipe.generate(prompt, generation_config) if ref_out != actual_out: print(f'ref_out: {ref_out}\n') @@ -48,6 +85,32 @@ def test_generation_compare_with_stateful(): assert ref_out == actual_out +generation_configs = [ + get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k(), + get_multinomial_temperature_top_p_and_top_k(), + get_multinomial_temperature_and_repetition_penalty(), + get_multinomial_temperature_and_frequence_penalty(), + get_multinomial_temperature_and_presence_penalty() +] +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("generation_config", generation_configs) +def test_multinomial_sampling(generation_config): + # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, + # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) + # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply + # different optimizations due to differences in provided topologies, leading to slight + # variations in raw logits. Therefore, there is no reliable reference for validation, + # so only ensure that no exceptions are raised. + prompt = 'What is OpenVINO?' + model_path = get_models_list()[0][1] + static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) + actual_out = static_pipe.generate(prompt, generation_config) + + @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly @@ -100,17 +163,18 @@ def test_batch_raise_error(): # TODO: For the further sampling support -generation_configs = [ - dict(num_beam_groups=3), - dict(do_sample=True) +generation_config = [ + get_beam_search(), + # NB: Only num_return_sequences=1 is supported! + get_multinomial_all_parameters() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") -@pytest.mark.parametrize("generation_config", generation_configs) +@pytest.mark.parametrize("generation_config", generation_config) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): model_path = get_models_list()[0][1] - prompt = 'The Sun is yellow because' + prompt = 'What is OpenVINO?' 
pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): pipe.generate(prompt, **generation_config) From 28435c802c7af3e01295562a23e3f2588e350cd8 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 11:56:33 +0000 Subject: [PATCH 06/20] Re-use CB sampler --- src/cpp/src/llm_pipeline_static.cpp | 137 +++++++++++++--------------- src/cpp/src/llm_pipeline_static.hpp | 6 +- src/cpp/src/sampler.cpp | 2 +- src/cpp/src/sampler.hpp | 2 +- 4 files changed, 66 insertions(+), 81 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index db4133b0e7..4ec225f949 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -646,7 +646,8 @@ StaticLLMPipeline::StaticLLMPipeline( const std::string& device, const ov::AnyMap& config ) : LLMPipelineImplBase(tokenizer, - utils::from_config_json_if_exists(models_path)) { + utils::from_config_json_if_exists(models_path)), + m_sampler(m_tokenizer) { auto properties = config; /* NB: Static LLM pipeline consists of two models, first to process the input prompt (prefill), @@ -675,6 +676,8 @@ StaticLLMPipeline::StaticLLMPipeline( if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); }; StaticLLMPipeline::StaticLLMPipeline( @@ -691,7 +694,7 @@ StaticLLMPipeline::StaticLLMPipeline( const std::string& device, const ov::AnyMap& properties, const ov::genai::GenerationConfig& generation_config -) : LLMPipelineImplBase(tokenizer, generation_config) { +) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { bool use_blobs = false; auto anyopt = get_option(properties, "USE_BLOBS"); if (anyopt.has_value()) { @@ -710,6 +713,8 @@ StaticLLMPipeline::StaticLLMPipeline( if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); } void StaticLLMPipeline::setupAndCompileModels( @@ -940,28 +945,29 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } -int64_t sample_next_token(const ov::Tensor& logits, - const GenerationConfig& config, - std::mt19937& rng_engine, - LogitProcessor& logit_processor) { - const size_t vocab_size = logits.get_shape()[2]; - const size_t seq_len_size = logits.get_shape()[1]; - const size_t offset = (seq_len_size - 1) * vocab_size; - // NB: Slice out and take probabilities only for the last token - Logits logit_vector(logits.data() + offset, vocab_size); - logit_processor.apply(logit_vector); - int64_t last_token = -1; - if (config.is_greedy_decoding()) { - last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; - } else if (config.is_multinomial()) { - last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index; - } else { - // NB: Only greedy and multinomial supported, - // the appropriate check is performed before - OPENVINO_ASSERT(false); +void stream_generated_tokens(std::shared_ptr streamer_ptr, + GenerationHandle& handle) { + if (streamer_ptr && handle->can_read()) { + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } } - logit_processor.register_new_generated_token(last_token); - return last_token; +} + +int64_t get_last_token(SequenceGroup::Ptr sequence_group) { + const 
auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); + const auto sequence = running_sequences.front(); + + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + OPENVINO_ASSERT(num_scheduled_tokens == 1u); + + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; } EncodedResults StaticLLMPipeline::generate( @@ -981,7 +987,10 @@ EncodedResults StaticLLMPipeline::generate( attention_mask = data->attention_mask; } - if (input_ids.get_shape().at(0) > 1u) { + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + + if (batch_size > 1u) { OPENVINO_THROW("Currently only batch size=1 is supported"); } @@ -1004,25 +1013,6 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } - // FIXME:... - if ( streamer_ptr && - !config.stop_strings.empty() && - !config.include_stop_str_in_output) { - OPENVINO_THROW("Static LLM pipeline doesn't support " - "\"include_stop_str_in_output: false\" when a streamer is provided"); - } - - std::vector input_ids_vec; - input_ids_vec.reserve(input_ids.get_size()); - std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); - LogitProcessor logit_processor(config, input_ids_vec); - m_rng_engine.seed(config.rng_seed); - - const auto processed_stop_strings = - process_stop_strings(config.stop_strings, m_tokenizer); - - ov::Shape prompts_shape = input_ids.get_shape(); - const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; auto& raw_perf_counters = results.perf_metrics.raw_metrics; // NB: Only batch=1 is supported now @@ -1060,12 +1050,20 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - auto last_token = sample_next_token( - m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor); - results.tokens[0].push_back(last_token); - if (streamer_ptr && streamer_ptr->put(last_token)) { - return results; - } + auto logits = m_prefill_request.get_tensor("logits"); + int64_t output_sequence_len = logits.get_shape().at(1); + + auto sequence_group = std::make_shared( + 0 /* request_id */, padded_input_ids, config, 1 /* block_size */); + sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len); + sequence_group->schedule_tokens(output_sequence_len); + + // NB: Controls what tokens are ready to be pushed into the streamer + GenerationHandle handle = std::make_shared( + sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters()); + + SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits); + stream_generated_tokens(streamer_ptr, handle); // Outputs: logits, ... 
const auto kStartOutputKVCacheLayers = 1u; @@ -1106,8 +1104,10 @@ EncodedResults StaticLLMPipeline::generate( std::fill(attention_mask_data, attention_mask_data + m_kvcache_desc.num_stored_tokens - 1u, 1u); attention_mask_data[m_kvcache_desc.total_size - 1] = 1u; - const size_t max_tokens = config.get_max_new_tokens(prompt_len); - for (int i = 0; i < max_tokens - 1; ++i) { + while (sequence_group->is_running()) { + sequence_group->schedule_tokens(1); + int64_t last_token = get_last_token(sequence_group); + input_ids_data[0] = last_token; position_ids_data[0] = m_kvcache_desc.num_stored_tokens; attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u; @@ -1115,37 +1115,16 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = sample_next_token( - m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor); - results.tokens[0].push_back(last_token); - raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); raw_perf_counters.m_batch_sizes.emplace_back(batch_size); - bool met_stop_str = false; - if (!config.stop_strings.empty()) { - auto match_result = match_stop_string(m_tokenizer, - results.tokens[0], - processed_stop_strings, - config.include_stop_str_in_output); - if (match_result.is_matched) { - met_stop_str = true; - results.tokens[0].erase( - results.tokens[0].end() - match_result.to_remove, results.tokens[0].end()); - } - } - - if (streamer_ptr && streamer_ptr->put(last_token)) { - break; - } - - if (met_stop_str || (last_token == config.eos_token_id && !config.ignore_eos)) { - break; - } + SamplerOutput sampler_output = m_sampler.sample( + {sequence_group}, m_kvcache_request.get_tensor("logits")); + stream_generated_tokens(streamer_ptr, handle); // NB: KV-cache is full, further generation is impossible if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { - break; + sequence_group->set_out_of_memory(); } // NB: Write KV-cache for the new token to the correct input position for the next iteration @@ -1168,6 +1147,12 @@ EncodedResults StaticLLMPipeline::generate( streamer_ptr->end(); } + OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u); + auto sequence = sequence_group->get_finished_sequences().front(); + results.tokens[0] = sequence->get_generated_ids(); + results.scores[0] = sequence->get_cumulative_log_prob(); + m_sampler.clear_request_info(sequence_group->get_request_id()); + auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. 
auto& metrics = results.perf_metrics; diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 664b820346..13d7752e2e 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -7,6 +7,7 @@ #include #include "llm_pipeline_base.hpp" +#include "sampler.hpp" namespace ov { namespace genai { @@ -78,15 +79,14 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool v_tensors_transposed; }; + Sampler m_sampler; + KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; bool m_is_chat_conversation = false; ChatHistory m_history; - - // NB: For multinomial sampling - std::mt19937 m_rng_engine; }; } // namespace genai diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 31aab070f0..e2a3238676 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -746,7 +746,7 @@ process_stop_strings(const std::set& stop_strings, Tokenizer& token return result; } -SamplerOutput Sampler::sample(std::vector & sequence_groups, +SamplerOutput Sampler::sample(const std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled) { const float * logits_data = logits.data(); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index baf57ebb40..271d209f75 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -88,7 +88,7 @@ class Sampler { Sampler() = default; Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {}; - SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); + SamplerOutput sample(const std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); void set_seed(size_t new_seed) { rng_engine.seed(new_seed); seed = new_seed; From 461bc8cd11d81b4c77520f7afb27241a7a681dea Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:25:12 +0000 Subject: [PATCH 07/20] Add test and fix SequenceGroup --- src/cpp/src/llm_pipeline_static.cpp | 1 + src/cpp/src/sequence_group.hpp | 4 ++-- .../python_tests/test_llm_pipeline_static.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 4ec225f949..a89957850f 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1125,6 +1125,7 @@ EncodedResults StaticLLMPipeline::generate( // NB: KV-cache is full, further generation is impossible if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { sequence_group->set_out_of_memory(); + break; } // NB: Write KV-cache for the new token to the correct input position for the next iteration diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 8f8d5f899e..2df8a1f200 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -290,8 +290,8 @@ class SequenceGroup : public std::enable_shared_from_this { } size_t num_finished_seqs() const { - return std::count_if(m_sequences.begin(), m_sequences.end(), [] (Sequence::CPtr seq) { - return seq->has_finished(); + return std::count_if(m_sequences.begin(), m_sequences.end(), [this] (Sequence::CPtr seq) { + return seq->has_finished() || seq->out_of_memory() || handle_dropped(); }); } diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index f8b7a7c2af..326386fe31 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ 
b/tests/python_tests/test_llm_pipeline_static.py @@ -196,6 +196,26 @@ def test_max_number_of_tokens(): assert len(encoded_results.tokens[0]) == num_tokens +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +def test_terminate_when_kvcache_is_full(): + model_path = get_models_list()[0][1] + prompt = 'The Sun is yellow because' + pipeline_config = { "MAX_PROMPT_LEN": 64, "MIN_RESPONSE_LEN": 64 } + pipeline_config |= common_config + kv_cache_size = pipeline_config['MAX_PROMPT_LEN'] + pipeline_config['MIN_RESPONSE_LEN'] + + tokenizer = ov_genai.Tokenizer(model_path) + tokenized_input = tokenizer.encode(prompt) + input_len = tokenized_input.input_ids.get_shape()[1] + + pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config) + encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True) + + assert len(encoded_results.tokens[0]) == (kv_cache_size - input_len + 1) + + # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") From 9f6aca40446ca1cee296c014df467a4a450898a3 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:25:28 +0000 Subject: [PATCH 08/20] Remove do_sample=False hardcode for GenAI --- tools/llm_bench/task/text_generation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 4822b228ca..d6aebdbc3e 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -227,7 +227,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] - gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -381,7 +380,6 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] - gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get("num_assistant_tokens", None): From 808d6b9603726de953d13c7dc27b8d10b387dad7 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:39:09 +0000 Subject: [PATCH 09/20] Fix comments to review --- src/cpp/src/llm_pipeline_static.cpp | 52 +++++++++---------- src/cpp/src/llm_pipeline_static.hpp | 3 +- src/cpp/src/sampler.cpp | 2 +- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/sequence_group.hpp | 2 +- .../python_tests/test_llm_pipeline_static.py | 10 ++-- 6 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index a89957850f..aac55015c0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "llm_pipeline_static.hpp" @@ -635,6 +635,31 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } +void 
stream_generated_tokens(std::shared_ptr streamer_ptr, + GenerationHandle& handle) { + if (streamer_ptr && handle->can_read()) { + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } + } +} + +int64_t get_last_token(SequenceGroup::Ptr sequence_group) { + const auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); + const auto sequence = running_sequences.front(); + + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + OPENVINO_ASSERT(num_scheduled_tokens == 1u); + + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; +} + } // anonymous namespace namespace ov { @@ -945,31 +970,6 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } -void stream_generated_tokens(std::shared_ptr streamer_ptr, - GenerationHandle& handle) { - if (streamer_ptr && handle->can_read()) { - std::unordered_map token = handle->back(); - for (const auto& gen_token : token.begin()->second.generated_ids) { - if (streamer_ptr->put(gen_token)) { - handle->drop(); - break; - } - } - } -} - -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { - const auto running_sequences = sequence_group->get_running_sequences(); - OPENVINO_ASSERT(running_sequences.size() == 1u); - const auto sequence = running_sequences.front(); - - size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); - OPENVINO_ASSERT(num_scheduled_tokens == 1u); - - const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); - return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; -} - EncodedResults StaticLLMPipeline::generate( const EncodedInputs& inputs, OptionalGenerationConfig generation_config, diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 13d7752e2e..8dc7ef49a1 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -1,10 +1,9 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include -#include #include "llm_pipeline_base.hpp" #include "sampler.hpp" diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index e2a3238676..73a406c695 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "sampler.hpp" diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 271d209f75..df0c406749 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -1,5 +1,5 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 2df8a1f200..6a17cf59b8 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 
326386fe31..10e7255309 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -99,9 +99,9 @@ def test_generation_compare_with_stateful(generation_config): @pytest.mark.nightly @pytest.mark.parametrize("generation_config", generation_configs) def test_multinomial_sampling(generation_config): - # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, - # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) - # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply + # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, + # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) + # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply # different optimizations due to differences in provided topologies, leading to slight # variations in raw logits. Therefore, there is no reliable reference for validation, # so only ensure that no exceptions are raised. @@ -163,13 +163,13 @@ def test_batch_raise_error(): # TODO: For the further sampling support -generation_config = [ +generation_configs = [ get_beam_search(), # NB: Only num_return_sequences=1 is supported! get_multinomial_all_parameters() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") -@pytest.mark.parametrize("generation_config", generation_config) +@pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): From eb02c49a5f171b0aec7b4a6aa183bdc12fadae4b Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:52:04 +0000 Subject: [PATCH 10/20] Revert changes in sampler --- src/cpp/src/llm_pipeline_static.cpp | 9 +- src/cpp/src/sampler.cpp | 131 ++++++++++++++-------------- src/cpp/src/sampler.hpp | 21 ----- 3 files changed, 68 insertions(+), 93 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index aac55015c0..6731fb8da0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -3,7 +3,6 @@ #include "llm_pipeline_static.hpp" -#include "logit_processor.hpp" #include "sampler.hpp" #include @@ -635,10 +634,10 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } -void stream_generated_tokens(std::shared_ptr streamer_ptr, - GenerationHandle& handle) { +void stream_generated_tokens(std::shared_ptr streamer_ptr, + ov::genai::GenerationHandle& handle) { if (streamer_ptr && handle->can_read()) { - std::unordered_map token = handle->back(); + std::unordered_map token = handle->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { if (streamer_ptr->put(gen_token)) { handle->drop(); @@ -648,7 +647,7 @@ void stream_generated_tokens(std::shared_ptr streamer_ptr, } } -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { +int64_t get_last_token(ov::genai::SequenceGroup::Ptr sequence_group) { const auto running_sequences = sequence_group->get_running_sequences(); OPENVINO_ASSERT(running_sequences.size() == 1u); const auto sequence = running_sequences.front(); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 73a406c695..6498a7d4c4 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -67,71 +67,6 @@ std::vector log_softmax(const ov::Tensor& 
logits, size_t batch_idx) { return tokens; } -Token greedy_sample(const Logits& logits, size_t top_logprobs) { - // For greedy sampling we do not expect sorting or shrinking considered tokens - // so we can operate directly on the data buffer - size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 - std::vector top_values(m, -std::numeric_limits::infinity()); - std::vector top_indexes(m, 0); - - for (size_t i = 0; i < logits.m_size; ++i) { - if (logits.m_data[i] > top_values.back()) { - top_values.back() = logits.m_data[i]; - top_indexes.back() = i; - - for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { - std::swap(top_values[j], top_values[j - 1]); - std::swap(top_indexes[j], top_indexes[j - 1]); - } - } - } - - size_t max_index = top_indexes.front(); - float max_value = 0.0; - - if (top_logprobs) { - // apply log softmax to max value - max_value = top_values.front(); - float log_sum = std::log(std::accumulate( - logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_value); - })); - max_value = -log_sum; - } - - return Token(max_value, max_index); -} - -std::vector multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine) { - // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. - std::vector multinomial_weights; - multinomial_weights.reserve(logits.m_size); - if (logits.is_vector_initialized()) - for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); - else - multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); - - // std::discrete_distribution returns corrupted results when applied to log probabilities - // which result returning NAN only logprobs. - // so log() is applied after this line - auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 - - std::vector out_tokens; - for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { - size_t element_to_pick = dist(rng_engine); - if (logits.is_vector_initialized()) { - auto logit = logits.m_vector[element_to_pick]; - logit.m_log_prob = std::log(logit.m_log_prob); - out_tokens.push_back(logit); - } - else - out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); - } - return out_tokens; -} - std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -160,6 +95,13 @@ std::vector encode_and_process_string(const std::string& stop_string, o return encoded_stop_string; } +struct MatchStopStringResult { + size_t to_remove = 0; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; + bool is_matched = false; +}; + // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. 
MatchStopStringResult match_stop_string(Tokenizer& tokenizer, const TokenIds& generated_tokens, @@ -539,11 +481,66 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to } Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { - return greedy_sample(logits, top_logprobs); + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 + std::vector top_values(m, -std::numeric_limits::infinity()); + std::vector top_indexes(m, 0); + + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > top_values.back()) { + top_values.back() = logits.m_data[i]; + top_indexes.back() = i; + + for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { + std::swap(top_values[j], top_values[j - 1]); + std::swap(top_indexes[j], top_indexes[j - 1]); + } + } + } + + size_t max_index = top_indexes.front(); + float max_value = 0.0; + + if (top_logprobs) { + // apply log softmax to max value + max_value = top_values.front(); + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + } + + return Token(max_value, max_index); } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { - return multinomial_sample(logits, num_tokens_per_sequence, rng_engine); + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. + std::vector multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilities + // which result returning NAN only logprobs. 
+ // so log() is applied after this line + auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; } std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index df0c406749..7796f93d1e 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -32,27 +32,6 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); -Token greedy_sample(const Logits& logits, size_t top_logprobs); - -std::vector multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine); - -std::pair> -process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer); - -struct MatchStopStringResult { - size_t to_remove = 0; - // int64_t last_token_id = 0; - // bool is_to_update_last_token = false; - bool is_matched = false; -}; - -MatchStopStringResult match_stop_string(Tokenizer& tokenizer, - const TokenIds& generated_tokens, - const std::pair>& stop_strings, - bool is_include_to_output); - struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; From 24866f38e6eee5b825b48d8ac2080cdd7548b440 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 15:20:42 +0000 Subject: [PATCH 11/20] Add test to check termination by sampler --- .../python_tests/test_llm_pipeline_static.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 10e7255309..5c7f07fcbb 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -183,7 +183,7 @@ def test_unsupported_sampling_raise_error(generation_config): @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_max_number_of_tokens(): +def test_terminate_by_max_number_of_tokens(): model_path = get_models_list()[0][1] prompt = 'The Sun is yellow because' num_tokens = 128 @@ -199,7 +199,7 @@ def test_max_number_of_tokens(): @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_terminate_when_kvcache_is_full(): +def test_terminate_by_out_of_memory(): model_path = get_models_list()[0][1] prompt = 'The Sun is yellow because' pipeline_config = { "MAX_PROMPT_LEN": 64, "MIN_RESPONSE_LEN": 64 } @@ -216,6 +216,29 @@ def test_terminate_when_kvcache_is_full(): assert len(encoded_results.tokens[0]) == (kv_cache_size - input_len + 1) +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +def test_terminate_by_sampler(): + model_path = get_models_list()[0][1] + prompt = 'The Sun is yellow because' + + current_iter = 0 + num_iters = 10 + def callback(subword): + nonlocal current_iter + current_iter += 1 + return current_iter == num_iters + + tokenizer = ov_genai.Tokenizer(model_path) + tokenized_input = tokenizer.encode(prompt) + + pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) + encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True, streamer=callback) + + assert len(encoded_results.tokens[0]) == num_iters + + # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") From 99a48be5efdb32c412ead496245d53c226d94236 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 15:53:56 +0000 Subject: [PATCH 12/20] Fix comments on review --- src/cpp/src/llm_pipeline_static.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 6731fb8da0..2d6dbb8cb0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -647,18 +647,6 @@ void stream_generated_tokens(std::shared_ptr streamer_p } } -int64_t get_last_token(ov::genai::SequenceGroup::Ptr sequence_group) { - const auto running_sequences = sequence_group->get_running_sequences(); - OPENVINO_ASSERT(running_sequences.size() == 1u); - const auto sequence = running_sequences.front(); - - size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); - OPENVINO_ASSERT(num_scheduled_tokens == 1u); - - const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); - return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; -} - } // anonymous namespace namespace ov { @@ -1012,6 +1000,10 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + if (config.num_return_sequences != 1u) { + OPENVINO_THROW("Currently only \"num_return_sequences\" equal to 1 is supported!"); + } + ov::genai::EncodedResults results; auto& raw_perf_counters = results.perf_metrics.raw_metrics; // NB: Only batch=1 is supported now @@ -1105,9 +1097,10 @@ EncodedResults StaticLLMPipeline::generate( while (sequence_group->is_running()) { sequence_group->schedule_tokens(1); - int64_t last_token = get_last_token(sequence_group); + const auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); - input_ids_data[0] = last_token; + input_ids_data[0] = running_sequences.front()->get_generated_ids().back(); position_ids_data[0] = m_kvcache_desc.num_stored_tokens; attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u; From 28c37d4e196a220fb9e5ebda6d79aa80d1fd154b Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Thu, 2 Jan 2025 18:33:03 +0000 Subject: [PATCH 13/20] Update tests/python_tests/test_llm_pipeline_static.py Co-authored-by: Ilya Lavrenov --- tests/python_tests/test_llm_pipeline_static.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 
5c7f07fcbb..81ce82793c 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -177,7 +177,7 @@ def test_unsupported_sampling_raise_error(generation_config): prompt = 'What is OpenVINO?' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): - pipe.generate(prompt, **generation_config) + pipe.generate(prompt, generation_config) @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") From 881a565fe826dc7676f420fe4642d5d2c02cdaf3 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Thu, 2 Jan 2025 21:00:28 +0000 Subject: [PATCH 14/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index d6aebdbc3e..ad1a55ef2f 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -198,7 +198,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption): - set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): @@ -226,6 +225,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.info(out_str) gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens + gen_config.rng_seed= args["seed"] gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): config_info = "Speculative decoding config: " @@ -352,7 +352,6 @@ def token_printer(): def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption): - set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): @@ -378,6 +377,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() gen_config = model.get_generation_config() + gen_config.rng_seed= args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): From 831bf86d02d392237a3b4452470644bafa1e7da8 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 08:50:44 +0000 Subject: [PATCH 15/20] Update test_llm_pipeline_static.py --- .../python_tests/test_llm_pipeline_static.py | 32 +------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 81ce82793c..bb73033e6a 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -11,26 +11,11 @@ ) from common import \ - get_greedy, \ - get_greedy_with_min_and_max_tokens, \ - get_greedy_with_repetition_penalty, \ - get_greedy_with_penalties, \ - get_greedy_with_min_and_max_tokens, \ - get_greedy_with_single_stop_string, \ - 
get_greedy_with_multiple_stop_strings, \ - get_greedy_with_multiple_stop_strings_no_match, \ - get_greedy_stop_strings_exclude_from_output, \ - get_greedy_stop_strings_include_to_output, \ get_greedy_n_stop_strings_exclude_from_output, \ get_greedy_n_stop_strings_include_to_output, \ get_multinomial_temperature, \ - get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_and_top_k, \ - get_multinomial_temperature_top_p_and_top_k, \ - get_multinomial_temperature_and_repetition_penalty, \ - get_multinomial_temperature_and_frequence_penalty, \ - get_multinomial_temperature_and_presence_penalty, \ get_multinomial_all_parameters, \ + get_multinomial_temperature_and_presence_penalty \ get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. @@ -53,15 +38,6 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ get_greedy(), - get_greedy_with_min_and_max_tokens(), - get_greedy_with_repetition_penalty(), - get_greedy_with_penalties(), - get_greedy_with_min_and_max_tokens(), - get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), - get_greedy_with_multiple_stop_strings_no_match(), - get_greedy_stop_strings_exclude_from_output(), - get_greedy_stop_strings_include_to_output(), get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ] @@ -86,12 +62,6 @@ def test_generation_compare_with_stateful(generation_config): generation_configs = [ - get_multinomial_temperature(), - get_multinomial_temperature_and_top_p(), - get_multinomial_temperature_and_top_k(), - get_multinomial_temperature_top_p_and_top_k(), - get_multinomial_temperature_and_repetition_penalty(), - get_multinomial_temperature_and_frequence_penalty(), get_multinomial_temperature_and_presence_penalty() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") From 7d1fd1d19853c64c32b53dcdc5504465be25a06a Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 08:51:29 +0000 Subject: [PATCH 16/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index bb73033e6a..c9ae3ce30f 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -37,7 +37,6 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ - get_greedy(), get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ] From dcb075c731f932db7871eb05d309b54ba94c3cbc Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 10:22:01 +0000 Subject: [PATCH 17/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index c9ae3ce30f..b3969fe002 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -15,7 +15,7 @@ get_greedy_n_stop_strings_include_to_output, \ get_multinomial_temperature, \ get_multinomial_all_parameters, \ - get_multinomial_temperature_and_presence_penalty \ + get_multinomial_temperature_and_presence_penalty, \ get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. From dcbf89014e99e9d3a7d55b43602a5b4367bb1d37 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 11:01:50 +0000 Subject: [PATCH 18/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index ad1a55ef2f..03fde296b1 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -225,7 +225,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.info(out_str) gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens - gen_config.rng_seed= args["seed"] + gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): config_info = "Speculative decoding config: " @@ -377,7 +377,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() gen_config = model.get_generation_config() - gen_config.rng_seed= args["seed"] + gen_config.rng_seed = args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): From 8e89a9b10286b88df87facdfb54063f6fdce357b Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 11:57:24 +0000 Subject: [PATCH 19/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 03fde296b1..c768d427e7 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ 
-227,6 +227,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config.max_new_tokens = max_gen_tokens gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] + gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -380,6 +381,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.rng_seed = args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] + gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get("num_assistant_tokens", None): From 4dcd5e04c01cbdca5c542f1c827d68d422aa674f Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Sat, 4 Jan 2025 09:56:16 +0000 Subject: [PATCH 20/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index ca22dab719..d2d3673356 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -15,8 +15,8 @@ from common import get_default_properties from common import \ - get_greedy_n_stop_strings_exclude_from_output, \ - get_greedy_n_stop_strings_include_to_output, \ + get_greedy, \ + get_greedy_with_penalties, \ get_multinomial_temperature, \ get_multinomial_all_parameters, \ get_multinomial_temperature_and_presence_penalty, \ @@ -46,8 +46,8 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ - get_greedy_n_stop_strings_exclude_from_output(), - get_greedy_n_stop_strings_include_to_output() + get_greedy(), + get_greedy_with_penalties() ] @pytest.mark.precommit @pytest.mark.nightly
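
Taken together, the series above enables greedy and multinomial sampling in the static (NPU) pipeline, restricts num_return_sequences to 1, and wires rng_seed/do_sample through llm_bench and the Python tests. What follows is a minimal, hypothetical usage sketch, not part of any patch: it assumes an LLM already exported under model_path and reuses only the pipeline-config keys and GenerationConfig fields that appear in the patched tests and text_generation.py (MAX_PROMPT_LEN, MIN_RESPONSE_LEN, max_new_tokens, do_sample, rng_seed); temperature is added purely to illustrate the multinomial branch.

import openvino_genai as ov_genai

# Assumption: a directory holding an exported model plus tokenizer, as in the tests.
model_path = "TinyLlama-1.1B-Chat-v1.0"
pipeline_config = {"MAX_PROMPT_LEN": 128, "MIN_RESPONSE_LEN": 64}

pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 64
config.do_sample = True        # take the multinomial branch; False keeps greedy decoding
config.temperature = 0.8       # illustrative sampling parameter
config.rng_seed = 42           # fixed seed, as llm_bench now sets for reproducibility
# num_return_sequences must stay 1 on this pipeline (PATCH 12/20 throws otherwise).

print(pipe.generate("The Sun is yellow because", config))

With do_sample left at its default of False (the value llm_bench now forces), the same call exercises the greedy path instead.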