From b80053182f94885fa092013fcdef7c0f2deff84a Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 23 Dec 2024 12:23:56 +0000 Subject: [PATCH 01/20] Experimental snapshot with greedy decoding --- src/cpp/src/llm_pipeline_static.cpp | 32 ++++++++++++-- src/cpp/src/sampler.cpp | 68 +++++++++++++++-------------- src/cpp/src/sampler.hpp | 1 + 3 files changed, 65 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 090aed9650..4505a8832e 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -3,6 +3,9 @@ #include "llm_pipeline_static.hpp" +#include "logit_processor.hpp" +#include "sampler.hpp" + #include #include @@ -937,6 +940,21 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } +int64_t sample_next_token(const ov::Tensor& logits, + const GenerationConfig& config, + LogitProcessor& logit_processor) { + Logits logit_vector(logits.data(), logits.get_shape()[2]); + logit_processor.apply(logit_vector); + int64_t last_token = -1; + if (config.is_greedy_decoding()) { + last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; + } else { + OPENVINO_ASSERT(false); + } + logit_processor.register_new_generated_token(last_token); + return last_token; +} + EncodedResults StaticLLMPipeline::generate( const EncodedInputs& inputs, OptionalGenerationConfig generation_config, @@ -973,10 +991,15 @@ EncodedResults StaticLLMPipeline::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } - if (!config.is_greedy_decoding()) { - OPENVINO_THROW("Currently only greedy decoding is supported"); + if (!config.is_greedy_decoding() && !config.is_multinomial()) { + OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + std::vector input_ids_vec; + input_ids_vec.reserve(input_ids.get_size()); + std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); + LogitProcessor logit_processor(config, input_ids_vec); + ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; @@ -1015,7 +1038,8 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + + auto last_token = sample_next_token(m_prefill_request.get_tensor("logits"), config, logit_processor); results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; @@ -1069,7 +1093,7 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); + last_token = sample_next_token(m_kvcache_request.get_tensor("logits"), config, logit_processor); results.tokens[0].push_back(last_token); raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index f77463d767..48c3e5354e 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -67,6 +67,41 @@ std::vector log_softmax(const ov::Tensor& logits, size_t batch_idx) { return tokens; } +Token greedy_sample(const Logits& logits, size_t top_logprobs) { + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + size_t m = 
std::max(size_t(1), top_logprobs); // ensure m is at least 1 + std::vector top_values(m, -std::numeric_limits::infinity()); + std::vector top_indexes(m, 0); + + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > top_values.back()) { + top_values.back() = logits.m_data[i]; + top_indexes.back() = i; + + for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { + std::swap(top_values[j], top_values[j - 1]); + std::swap(top_indexes[j], top_indexes[j - 1]); + } + } + } + + size_t max_index = top_indexes.front(); + float max_value = 0.0; + + if (top_logprobs) { + // apply log softmax to max value + max_value = top_values.front(); + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + } + + return Token(max_value, max_index); +} + std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -493,38 +528,7 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to } Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { - // For greedy sampling we do not expect sorting or shrinking considered tokens - // so we can operate directly on the data buffer - size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 - std::vector top_values(m, -std::numeric_limits::infinity()); - std::vector top_indexes(m, 0); - - for (size_t i = 0; i < logits.m_size; ++i) { - if (logits.m_data[i] > top_values.back()) { - top_values.back() = logits.m_data[i]; - top_indexes.back() = i; - - for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { - std::swap(top_values[j], top_values[j - 1]); - std::swap(top_indexes[j], top_indexes[j - 1]); - } - } - } - - size_t max_index = top_indexes.front(); - float max_value = 0.0; - - if (top_logprobs) { - // apply log softmax to max value - max_value = top_values.front(); - float log_sum = std::log(std::accumulate( - logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_value); - })); - max_value = -log_sum; - } - - return Token(max_value, max_index); + return greedy_sample(logits, top_logprobs); } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 0f7876cbf9..0cf983fc01 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -31,6 +31,7 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); +Token greedy_sample(const Logits& logits, size_t top_logprobs); struct SamplerOutput { // IDs of sequences that need to be dropped From 55ead2d6066e4fd8dc6b2e20b7e22e13a551148d Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 23 Dec 2024 16:17:56 +0000 Subject: [PATCH 02/20] Add multinomial support --- src/cpp/src/llm_pipeline_static.cpp | 17 +++++++-- src/cpp/src/llm_pipeline_static.hpp | 4 ++ src/cpp/src/sampler.cpp | 57 ++++++++++++++++------------- src/cpp/src/sampler.hpp | 5 +++ 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp 
b/src/cpp/src/llm_pipeline_static.cpp index 4505a8832e..282c5e957a 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -942,13 +942,22 @@ DecodedResults StaticLLMPipeline::generate( int64_t sample_next_token(const ov::Tensor& logits, const GenerationConfig& config, + std::mt19937& rng_engine, LogitProcessor& logit_processor) { - Logits logit_vector(logits.data(), logits.get_shape()[2]); + const size_t vocab_size = logits.get_shape()[2]; + const size_t seq_len_size = logits.get_shape()[1]; + const size_t offset = (seq_len_size - 1) * vocab_size; + // NB: Slice out and take probabilities only for the last token + Logits logit_vector(logits.data() + offset, vocab_size); logit_processor.apply(logit_vector); int64_t last_token = -1; if (config.is_greedy_decoding()) { last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; + } else if (config.is_multinomial()) { + last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index; } else { + // NB: Only greedy and multinomial supported, + // the appropriate check is performed before OPENVINO_ASSERT(false); } logit_processor.register_new_generated_token(last_token); @@ -1039,7 +1048,8 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - auto last_token = sample_next_token(m_prefill_request.get_tensor("logits"), config, logit_processor); + auto last_token = sample_next_token( + m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor); results.tokens[0].push_back(last_token); if (streamer_ptr && streamer_ptr->put(last_token)) { return results; @@ -1093,7 +1103,8 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = sample_next_token(m_kvcache_request.get_tensor("logits"), config, logit_processor); + last_token = sample_next_token( + m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor); results.tokens[0].push_back(last_token); raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 7acc28c684..664b820346 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include "llm_pipeline_base.hpp" @@ -83,6 +84,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool m_is_chat_conversation = false; ChatHistory m_history; + + // NB: For multinomial sampling + std::mt19937 m_rng_engine; }; } // namespace genai diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 48c3e5354e..468bdc2b48 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -102,6 +102,36 @@ Token greedy_sample(const Logits& logits, size_t top_logprobs) { return Token(max_value, max_index); } +std::vector multinomial_sample(const Logits& logits, + size_t num_tokens_per_sequence, + std::mt19937& rng_engine) { + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. 
+ std::vector multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilities + // which result returning NAN only logprobs. + // so log() is applied after this line + auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; +} + std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -532,31 +562,7 @@ Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { - // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. - std::vector multinomial_weights; - multinomial_weights.reserve(logits.m_size); - if (logits.is_vector_initialized()) - for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); - else - multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); - - // std::discrete_distribution returns corrupted results when applied to log probabilities - // which result returning NAN only logprobs. 
- // so log() is applied after this line - auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 - - std::vector out_tokens; - for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { - size_t element_to_pick = dist(rng_engine); - if (logits.is_vector_initialized()) { - auto logit = logits.m_vector[element_to_pick]; - logit.m_log_prob = std::log(logit.m_log_prob); - out_tokens.push_back(logit); - } - else - out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); - } - return out_tokens; + return multinomial_sample(logits, num_tokens_per_sequence, rng_engine); } std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { diff --git a/src/cpp/src/sampler.hpp index 0cf983fc01..d79173087a 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -31,8 +31,13 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); + Token greedy_sample(const Logits& logits, size_t top_logprobs); +std::vector multinomial_sample(const Logits& logits, + size_t num_tokens_per_sequence, + std::mt19937& rng_engine); + struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; From 188575d92d52e5a489c1e0b73175b771c3b52415 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Tue, 24 Dec 2024 10:46:20 +0000 Subject: [PATCH 03/20] Handle rng seed --- src/cpp/src/llm_pipeline_static.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ee81ea377b..34d118c387 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1009,6 +1009,7 @@ EncodedResults StaticLLMPipeline::generate( input_ids_vec.reserve(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); LogitProcessor logit_processor(config, input_ids_vec); + m_rng_engine.seed(config.rng_seed); ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; From ab849500cc6a655ed1fff1c96ca05fd69e2964e8 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Tue, 24 Dec 2024 11:14:16 +0000 Subject: [PATCH 04/20] Update sampler.cpp --- src/cpp/src/sampler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/sampler.cpp index d50caeffda..0e53f1b4ac 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -771,6 +771,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); const ov::genai::GenerationConfig& sampling_params =
sequence_group->get_sampling_parameters(); + const auto request_id = sequence_group->get_request_id(); if (!m_logit_processors.count(request_id)) { m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())}); From c73f1f50bf9a9eeaa191f7774109a11e5637d772 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Mon, 30 Dec 2024 15:06:41 +0000 Subject: [PATCH 05/20] Handle stop_strings and add more tests --- src/cpp/src/llm_pipeline_static.cpp | 34 ++++++-- src/cpp/src/sampler.cpp | 7 -- src/cpp/src/sampler.hpp | 15 ++++ .../python_tests/test_llm_pipeline_static.py | 84 ++++++++++++++++--- 4 files changed, 118 insertions(+), 22 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 34d118c387..db4133b0e7 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -238,12 +238,12 @@ enum class GenerateHint { std::string to_string(GenerateHint h) { switch(h) { - case GenerateHint::FAST_COMPILE : + case GenerateHint::FAST_COMPILE : return "FAST_COMPILE"; - case GenerateHint::BEST_PERF : + case GenerateHint::BEST_PERF : return "BEST_PERF"; default: - OPENVINO_THROW("Unsupported value for type GenerateHint provided"); + OPENVINO_THROW("Unsupported value for type GenerateHint provided"); } } @@ -692,7 +692,6 @@ StaticLLMPipeline::StaticLLMPipeline( const ov::AnyMap& properties, const ov::genai::GenerationConfig& generation_config ) : LLMPipelineImplBase(tokenizer, generation_config) { - bool use_blobs = false; auto anyopt = get_option(properties, "USE_BLOBS"); if (anyopt.has_value()) { @@ -1005,12 +1004,23 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + // FIXME:... 
+ if ( streamer_ptr && + !config.stop_strings.empty() && + !config.include_stop_str_in_output) { + OPENVINO_THROW("Static LLM pipeline doesn't support " + "\"include_stop_str_in_output: false\" when a streamer is provided"); + } + std::vector input_ids_vec; input_ids_vec.reserve(input_ids.get_size()); std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); LogitProcessor logit_processor(config, input_ids_vec); m_rng_engine.seed(config.rng_seed); + const auto processed_stop_strings = + process_stop_strings(config.stop_strings, m_tokenizer); + ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; @@ -1111,11 +1121,25 @@ EncodedResults StaticLLMPipeline::generate( raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + bool met_stop_str = false; + if (!config.stop_strings.empty()) { + auto match_result = match_stop_string(m_tokenizer, + results.tokens[0], + processed_stop_strings, + config.include_stop_str_in_output); + if (match_result.is_matched) { + met_stop_str = true; + results.tokens[0].erase( + results.tokens[0].end() - match_result.to_remove, results.tokens[0].end()); + } + } + if (streamer_ptr && streamer_ptr->put(last_token)) { break; } - if (last_token == config.eos_token_id && !config.ignore_eos) { + if (met_stop_str || (last_token == config.eos_token_id && !config.ignore_eos)) { break; } diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index d50caeffda..646eb6b85e 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -160,13 +160,6 @@ std::vector encode_and_process_string(const std::string& stop_string, o return encoded_stop_string; } -struct MatchStopStringResult { - size_t to_remove = 0; - // int64_t last_token_id = 0; - // bool is_to_update_last_token = false; - bool is_matched = false; -}; - // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. 
MatchStopStringResult match_stop_string(Tokenizer& tokenizer, const TokenIds& generated_tokens, diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 12870ec70e..baf57ebb40 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -38,6 +38,21 @@ std::vector multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence, std::mt19937& rng_engine); +std::pair> +process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer); + +struct MatchStopStringResult { + size_t to_remove = 0; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; + bool is_matched = false; +}; + +MatchStopStringResult match_stop_string(Tokenizer& tokenizer, + const TokenIds& generated_tokens, + const std::pair>& stop_strings, + bool is_include_to_output); + struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index cad8b0fea0..06dc6e8675 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import openvino_genai as ov_genai -from openvino.runtime import Core +from openvino_genai import GenerationConfig import pytest import sys from ov_genai_test_utils import ( @@ -10,6 +10,28 @@ get_chat_models_list, ) +from common import \ + get_greedy, \ + get_greedy_with_min_and_max_tokens, \ + get_greedy_with_repetition_penalty, \ + get_greedy_with_penalties, \ + get_greedy_with_min_and_max_tokens, \ + get_greedy_with_single_stop_string, \ + get_greedy_with_multiple_stop_strings, \ + get_greedy_with_multiple_stop_strings_no_match, \ + get_greedy_stop_strings_exclude_from_output, \ + get_greedy_stop_strings_include_to_output, \ + get_greedy_n_stop_strings_exclude_from_output, \ + get_greedy_n_stop_strings_include_to_output, \ + get_multinomial_temperature, \ + get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_and_top_k, \ + get_multinomial_temperature_top_p_and_top_k, \ + get_multinomial_temperature_and_repetition_penalty, \ + get_multinomial_temperature_and_frequence_penalty, \ + get_multinomial_temperature_and_presence_penalty, \ + get_multinomial_all_parameters, \ + get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. common_config = { @@ -29,18 +51,33 @@ def generate_chat_history(model_path, device, pipeline_config, questions): return chat_history +generation_configs = [ + get_greedy(), + get_greedy_with_min_and_max_tokens(), + get_greedy_with_repetition_penalty(), + get_greedy_with_penalties(), + get_greedy_with_min_and_max_tokens(), + get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), + get_greedy_with_multiple_stop_strings_no_match(), + get_greedy_stop_strings_exclude_from_output(), + get_greedy_stop_strings_include_to_output(), + get_greedy_n_stop_strings_exclude_from_output(), + get_greedy_n_stop_strings_include_to_output() +] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_generation_compare_with_stateful(): - prompt = 'The Sun is yellow because' +@pytest.mark.parametrize("generation_config", generation_configs) +def test_generation_compare_with_stateful(generation_config): + prompt = 'What is OpenVINO?' 
model_path = get_models_list()[0][1] stateful_pipe = ov_genai.LLMPipeline(model_path, "CPU") - ref_out = stateful_pipe.generate(prompt, max_new_tokens=100) + ref_out = stateful_pipe.generate(prompt, generation_config) static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) - actual_out = static_pipe.generate(prompt, max_new_tokens=100) + actual_out = static_pipe.generate(prompt, generation_config) if ref_out != actual_out: print(f'ref_out: {ref_out}\n') @@ -48,6 +85,32 @@ def test_generation_compare_with_stateful(): assert ref_out == actual_out +generation_configs = [ + get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k(), + get_multinomial_temperature_top_p_and_top_k(), + get_multinomial_temperature_and_repetition_penalty(), + get_multinomial_temperature_and_frequence_penalty(), + get_multinomial_temperature_and_presence_penalty() +] +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("generation_config", generation_configs) +def test_multinomial_sampling(generation_config): + # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, + # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) + # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply + # different optimizations due to differences in provided topologies, leading to slight + # variations in raw logits. Therefore, there is no reliable reference for validation, + # so only ensure that no exceptions are raised. + prompt = 'What is OpenVINO?' + model_path = get_models_list()[0][1] + static_pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) + actual_out = static_pipe.generate(prompt, generation_config) + + @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly @@ -100,17 +163,18 @@ def test_batch_raise_error(): # TODO: For the further sampling support -generation_configs = [ - dict(num_beam_groups=3), - dict(do_sample=True) +generation_config = [ + get_beam_search(), + # NB: Only num_return_sequences=1 is supported! + get_multinomial_all_parameters() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") -@pytest.mark.parametrize("generation_config", generation_configs) +@pytest.mark.parametrize("generation_config", generation_config) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): model_path = get_models_list()[0][1] - prompt = 'The Sun is yellow because' + prompt = 'What is OpenVINO?' 
pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): pipe.generate(prompt, **generation_config) From 28435c802c7af3e01295562a23e3f2588e350cd8 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 11:56:33 +0000 Subject: [PATCH 06/20] Re-use CB sampler --- src/cpp/src/llm_pipeline_static.cpp | 137 +++++++++++++--------------- src/cpp/src/llm_pipeline_static.hpp | 6 +- src/cpp/src/sampler.cpp | 2 +- src/cpp/src/sampler.hpp | 2 +- 4 files changed, 66 insertions(+), 81 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index db4133b0e7..4ec225f949 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -646,7 +646,8 @@ StaticLLMPipeline::StaticLLMPipeline( const std::string& device, const ov::AnyMap& config ) : LLMPipelineImplBase(tokenizer, - utils::from_config_json_if_exists(models_path)) { + utils::from_config_json_if_exists(models_path)), + m_sampler(m_tokenizer) { auto properties = config; /* NB: Static LLM pipeline consists of two models, first to process the input prompt (prefill), @@ -675,6 +676,8 @@ StaticLLMPipeline::StaticLLMPipeline( if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); }; StaticLLMPipeline::StaticLLMPipeline( @@ -691,7 +694,7 @@ StaticLLMPipeline::StaticLLMPipeline( const std::string& device, const ov::AnyMap& properties, const ov::genai::GenerationConfig& generation_config -) : LLMPipelineImplBase(tokenizer, generation_config) { +) : LLMPipelineImplBase(tokenizer, generation_config), m_sampler(m_tokenizer) { bool use_blobs = false; auto anyopt = get_option(properties, "USE_BLOBS"); if (anyopt.has_value()) { @@ -710,6 +713,8 @@ StaticLLMPipeline::StaticLLMPipeline( if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); } void StaticLLMPipeline::setupAndCompileModels( @@ -940,28 +945,29 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } -int64_t sample_next_token(const ov::Tensor& logits, - const GenerationConfig& config, - std::mt19937& rng_engine, - LogitProcessor& logit_processor) { - const size_t vocab_size = logits.get_shape()[2]; - const size_t seq_len_size = logits.get_shape()[1]; - const size_t offset = (seq_len_size - 1) * vocab_size; - // NB: Slice out and take probabilities only for the last token - Logits logit_vector(logits.data() + offset, vocab_size); - logit_processor.apply(logit_vector); - int64_t last_token = -1; - if (config.is_greedy_decoding()) { - last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index; - } else if (config.is_multinomial()) { - last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index; - } else { - // NB: Only greedy and multinomial supported, - // the appropriate check is performed before - OPENVINO_ASSERT(false); +void stream_generated_tokens(std::shared_ptr streamer_ptr, + GenerationHandle& handle) { + if (streamer_ptr && handle->can_read()) { + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } } - logit_processor.register_new_generated_token(last_token); - return last_token; +} + +int64_t get_last_token(SequenceGroup::Ptr sequence_group) { + const 
auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); + const auto sequence = running_sequences.front(); + + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + OPENVINO_ASSERT(num_scheduled_tokens == 1u); + + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; } EncodedResults StaticLLMPipeline::generate( @@ -981,7 +987,10 @@ EncodedResults StaticLLMPipeline::generate( attention_mask = data->attention_mask; } - if (input_ids.get_shape().at(0) > 1u) { + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + + if (batch_size > 1u) { OPENVINO_THROW("Currently only batch size=1 is supported"); } @@ -1004,25 +1013,6 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } - // FIXME:... - if ( streamer_ptr && - !config.stop_strings.empty() && - !config.include_stop_str_in_output) { - OPENVINO_THROW("Static LLM pipeline doesn't support " - "\"include_stop_str_in_output: false\" when a streamer is provided"); - } - - std::vector input_ids_vec; - input_ids_vec.reserve(input_ids.get_size()); - std::copy_n(input_ids.data(), input_ids.get_size(), std::back_inserter(input_ids_vec)); - LogitProcessor logit_processor(config, input_ids_vec); - m_rng_engine.seed(config.rng_seed); - - const auto processed_stop_strings = - process_stop_strings(config.stop_strings, m_tokenizer); - - ov::Shape prompts_shape = input_ids.get_shape(); - const size_t batch_size = prompts_shape[0]; ov::genai::EncodedResults results; auto& raw_perf_counters = results.perf_metrics.raw_metrics; // NB: Only batch=1 is supported now @@ -1060,12 +1050,20 @@ EncodedResults StaticLLMPipeline::generate( // NB: Now there are prompt_len tokens in KV-cache m_kvcache_desc.num_stored_tokens += static_cast(prompt_len); - auto last_token = sample_next_token( - m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor); - results.tokens[0].push_back(last_token); - if (streamer_ptr && streamer_ptr->put(last_token)) { - return results; - } + auto logits = m_prefill_request.get_tensor("logits"); + int64_t output_sequence_len = logits.get_shape().at(1); + + auto sequence_group = std::make_shared( + 0 /* request_id */, padded_input_ids, config, 1 /* block_size */); + sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len); + sequence_group->schedule_tokens(output_sequence_len); + + // NB: Controls what tokens are ready to be pushed into the streamer + GenerationHandle handle = std::make_shared( + sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters()); + + SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits); + stream_generated_tokens(streamer_ptr, handle); // Outputs: logits, ... 
const auto kStartOutputKVCacheLayers = 1u; @@ -1106,8 +1104,10 @@ EncodedResults StaticLLMPipeline::generate( std::fill(attention_mask_data, attention_mask_data + m_kvcache_desc.num_stored_tokens - 1u, 1u); attention_mask_data[m_kvcache_desc.total_size - 1] = 1u; - const size_t max_tokens = config.get_max_new_tokens(prompt_len); - for (int i = 0; i < max_tokens - 1; ++i) { + while (sequence_group->is_running()) { + sequence_group->schedule_tokens(1); + int64_t last_token = get_last_token(sequence_group); + input_ids_data[0] = last_token; position_ids_data[0] = m_kvcache_desc.num_stored_tokens; attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u; @@ -1115,37 +1115,16 @@ EncodedResults StaticLLMPipeline::generate( m_kvcache_request.infer(); m_kvcache_desc.num_stored_tokens += 1; - last_token = sample_next_token( - m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor); - results.tokens[0].push_back(last_token); - raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); raw_perf_counters.m_batch_sizes.emplace_back(batch_size); - bool met_stop_str = false; - if (!config.stop_strings.empty()) { - auto match_result = match_stop_string(m_tokenizer, - results.tokens[0], - processed_stop_strings, - config.include_stop_str_in_output); - if (match_result.is_matched) { - met_stop_str = true; - results.tokens[0].erase( - results.tokens[0].end() - match_result.to_remove, results.tokens[0].end()); - } - } - - if (streamer_ptr && streamer_ptr->put(last_token)) { - break; - } - - if (met_stop_str || (last_token == config.eos_token_id && !config.ignore_eos)) { - break; - } + SamplerOutput sampler_output = m_sampler.sample( + {sequence_group}, m_kvcache_request.get_tensor("logits")); + stream_generated_tokens(streamer_ptr, handle); // NB: KV-cache is full, further generation is impossible if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { - break; + sequence_group->set_out_of_memory(); } // NB: Write KV-cache for the new token to the correct input position for the next iteration @@ -1168,6 +1147,12 @@ EncodedResults StaticLLMPipeline::generate( streamer_ptr->end(); } + OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u); + auto sequence = sequence_group->get_finished_sequences().front(); + results.tokens[0] = sequence->get_generated_ids(); + results.scores[0] = sequence->get_cumulative_log_prob(); + m_sampler.clear_request_info(sequence_group->get_request_id()); + auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. 
auto& metrics = results.perf_metrics; diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 664b820346..13d7752e2e 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -7,6 +7,7 @@ #include #include "llm_pipeline_base.hpp" +#include "sampler.hpp" namespace ov { namespace genai { @@ -78,15 +79,14 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { bool v_tensors_transposed; }; + Sampler m_sampler; + KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; bool m_is_chat_conversation = false; ChatHistory m_history; - - // NB: For multinomial sampling - std::mt19937 m_rng_engine; }; } // namespace genai diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 31aab070f0..e2a3238676 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -746,7 +746,7 @@ process_stop_strings(const std::set& stop_strings, Tokenizer& token return result; } -SamplerOutput Sampler::sample(std::vector & sequence_groups, +SamplerOutput Sampler::sample(const std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled) { const float * logits_data = logits.data(); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index baf57ebb40..271d209f75 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -88,7 +88,7 @@ class Sampler { Sampler() = default; Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {}; - SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); + SamplerOutput sample(const std::vector & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); void set_seed(size_t new_seed) { rng_engine.seed(new_seed); seed = new_seed; From 461bc8cd11d81b4c77520f7afb27241a7a681dea Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:25:12 +0000 Subject: [PATCH 07/20] Add test and fix SequenceGroup --- src/cpp/src/llm_pipeline_static.cpp | 1 + src/cpp/src/sequence_group.hpp | 4 ++-- .../python_tests/test_llm_pipeline_static.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 4ec225f949..a89957850f 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1125,6 +1125,7 @@ EncodedResults StaticLLMPipeline::generate( // NB: KV-cache is full, further generation is impossible if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { sequence_group->set_out_of_memory(); + break; } // NB: Write KV-cache for the new token to the correct input position for the next iteration diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 8f8d5f899e..2df8a1f200 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -290,8 +290,8 @@ class SequenceGroup : public std::enable_shared_from_this { } size_t num_finished_seqs() const { - return std::count_if(m_sequences.begin(), m_sequences.end(), [] (Sequence::CPtr seq) { - return seq->has_finished(); + return std::count_if(m_sequences.begin(), m_sequences.end(), [this] (Sequence::CPtr seq) { + return seq->has_finished() || seq->out_of_memory() || handle_dropped(); }); } diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index f8b7a7c2af..326386fe31 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ 
b/tests/python_tests/test_llm_pipeline_static.py @@ -196,6 +196,26 @@ def test_max_number_of_tokens(): assert len(encoded_results.tokens[0]) == num_tokens +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +def test_terminate_when_kvcache_is_full(): + model_path = get_models_list()[0][1] + prompt = 'The Sun is yellow because' + pipeline_config = { "MAX_PROMPT_LEN": 64, "MIN_RESPONSE_LEN": 64 } + pipeline_config |= common_config + kv_cache_size = pipeline_config['MAX_PROMPT_LEN'] + pipeline_config['MIN_RESPONSE_LEN'] + + tokenizer = ov_genai.Tokenizer(model_path) + tokenized_input = tokenizer.encode(prompt) + input_len = tokenized_input.input_ids.get_shape()[1] + + pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config) + encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True) + + assert len(encoded_results.tokens[0]) == (kv_cache_size - input_len + 1) + + # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") From 9f6aca40446ca1cee296c014df467a4a450898a3 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:25:28 +0000 Subject: [PATCH 08/20] Remove do_sample=False hardcode for GenAI --- tools/llm_bench/task/text_generation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 4822b228ca..d6aebdbc3e 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -227,7 +227,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] - gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -381,7 +380,6 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] - gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get("num_assistant_tokens", None): From 808d6b9603726de953d13c7dc27b8d10b387dad7 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:39:09 +0000 Subject: [PATCH 09/20] Fix comments to review --- src/cpp/src/llm_pipeline_static.cpp | 52 +++++++++---------- src/cpp/src/llm_pipeline_static.hpp | 3 +- src/cpp/src/sampler.cpp | 2 +- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/sequence_group.hpp | 2 +- .../python_tests/test_llm_pipeline_static.py | 10 ++-- 6 files changed, 35 insertions(+), 36 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index a89957850f..aac55015c0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "llm_pipeline_static.hpp" @@ -635,6 +635,31 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } +void 
stream_generated_tokens(std::shared_ptr streamer_ptr, + GenerationHandle& handle) { + if (streamer_ptr && handle->can_read()) { + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } + } +} + +int64_t get_last_token(SequenceGroup::Ptr sequence_group) { + const auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); + const auto sequence = running_sequences.front(); + + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + OPENVINO_ASSERT(num_scheduled_tokens == 1u); + + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; +} + } // anonymous namespace namespace ov { @@ -945,31 +970,6 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } -void stream_generated_tokens(std::shared_ptr streamer_ptr, - GenerationHandle& handle) { - if (streamer_ptr && handle->can_read()) { - std::unordered_map token = handle->back(); - for (const auto& gen_token : token.begin()->second.generated_ids) { - if (streamer_ptr->put(gen_token)) { - handle->drop(); - break; - } - } - } -} - -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { - const auto running_sequences = sequence_group->get_running_sequences(); - OPENVINO_ASSERT(running_sequences.size() == 1u); - const auto sequence = running_sequences.front(); - - size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); - OPENVINO_ASSERT(num_scheduled_tokens == 1u); - - const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); - return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; -} - EncodedResults StaticLLMPipeline::generate( const EncodedInputs& inputs, OptionalGenerationConfig generation_config, diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 13d7752e2e..8dc7ef49a1 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -1,10 +1,9 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include -#include #include "llm_pipeline_base.hpp" #include "sampler.hpp" diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index e2a3238676..73a406c695 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "sampler.hpp" diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 271d209f75..df0c406749 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -1,5 +1,5 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 2df8a1f200..6a17cf59b8 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 
326386fe31..10e7255309 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -99,9 +99,9 @@ def test_generation_compare_with_stateful(generation_config): @pytest.mark.nightly @pytest.mark.parametrize("generation_config", generation_configs) def test_multinomial_sampling(generation_config): - # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, - # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) - # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply + # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, + # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) + # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply # different optimizations due to differences in provided topologies, leading to slight # variations in raw logits. Therefore, there is no reliable reference for validation, # so only ensure that no exceptions are raised. @@ -163,13 +163,13 @@ def test_batch_raise_error(): # TODO: For the further sampling support -generation_config = [ +generation_configs = [ get_beam_search(), # NB: Only num_return_sequences=1 is supported! get_multinomial_all_parameters() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") -@pytest.mark.parametrize("generation_config", generation_config) +@pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config): From eb02c49a5f171b0aec7b4a6aa183bdc12fadae4b Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 14:52:04 +0000 Subject: [PATCH 10/20] Revert changes in sampler --- src/cpp/src/llm_pipeline_static.cpp | 9 +- src/cpp/src/sampler.cpp | 131 ++++++++++++++-------------- src/cpp/src/sampler.hpp | 21 ----- 3 files changed, 68 insertions(+), 93 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index aac55015c0..6731fb8da0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -3,7 +3,6 @@ #include "llm_pipeline_static.hpp" -#include "logit_processor.hpp" #include "sampler.hpp" #include @@ -635,10 +634,10 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } -void stream_generated_tokens(std::shared_ptr streamer_ptr, - GenerationHandle& handle) { +void stream_generated_tokens(std::shared_ptr streamer_ptr, + ov::genai::GenerationHandle& handle) { if (streamer_ptr && handle->can_read()) { - std::unordered_map token = handle->back(); + std::unordered_map token = handle->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { if (streamer_ptr->put(gen_token)) { handle->drop(); @@ -648,7 +647,7 @@ void stream_generated_tokens(std::shared_ptr streamer_ptr, } } -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { +int64_t get_last_token(ov::genai::SequenceGroup::Ptr sequence_group) { const auto running_sequences = sequence_group->get_running_sequences(); OPENVINO_ASSERT(running_sequences.size() == 1u); const auto sequence = running_sequences.front(); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 73a406c695..6498a7d4c4 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -67,71 +67,6 @@ std::vector log_softmax(const ov::Tensor& 
logits, size_t batch_idx) { return tokens; } -Token greedy_sample(const Logits& logits, size_t top_logprobs) { - // For greedy sampling we do not expect sorting or shrinking considered tokens - // so we can operate directly on the data buffer - size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 - std::vector top_values(m, -std::numeric_limits::infinity()); - std::vector top_indexes(m, 0); - - for (size_t i = 0; i < logits.m_size; ++i) { - if (logits.m_data[i] > top_values.back()) { - top_values.back() = logits.m_data[i]; - top_indexes.back() = i; - - for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { - std::swap(top_values[j], top_values[j - 1]); - std::swap(top_indexes[j], top_indexes[j - 1]); - } - } - } - - size_t max_index = top_indexes.front(); - float max_value = 0.0; - - if (top_logprobs) { - // apply log softmax to max value - max_value = top_values.front(); - float log_sum = std::log(std::accumulate( - logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_value); - })); - max_value = -log_sum; - } - - return Token(max_value, max_index); -} - -std::vector multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine) { - // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. - std::vector multinomial_weights; - multinomial_weights.reserve(logits.m_size); - if (logits.is_vector_initialized()) - for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); - else - multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); - - // std::discrete_distribution returns corrupted results when applied to log probabilities - // which result returning NAN only logprobs. - // so log() is applied after this line - auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 - - std::vector out_tokens; - for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { - size_t element_to_pick = dist(rng_engine); - if (logits.is_vector_initialized()) { - auto logit = logits.m_vector[element_to_pick]; - logit.m_log_prob = std::log(logit.m_log_prob); - out_tokens.push_back(logit); - } - else - out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); - } - return out_tokens; -} - std::vector wrap_tokens(const std::vector& tokens, const std::vector& prefix_tokens, const std::vector& suffix_tokens) { std::vector all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -160,6 +95,13 @@ std::vector encode_and_process_string(const std::string& stop_string, o return encoded_stop_string; } +struct MatchStopStringResult { + size_t to_remove = 0; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; + bool is_matched = false; +}; + // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. 
MatchStopStringResult match_stop_string(Tokenizer& tokenizer, const TokenIds& generated_tokens, @@ -539,11 +481,66 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to } Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { - return greedy_sample(logits, top_logprobs); + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 + std::vector top_values(m, -std::numeric_limits::infinity()); + std::vector top_indexes(m, 0); + + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > top_values.back()) { + top_values.back() = logits.m_data[i]; + top_indexes.back() = i; + + for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { + std::swap(top_values[j], top_values[j - 1]); + std::swap(top_indexes[j], top_indexes[j - 1]); + } + } + } + + size_t max_index = top_indexes.front(); + float max_value = 0.0; + + if (top_logprobs) { + // apply log softmax to max value + max_value = top_values.front(); + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + } + + return Token(max_value, max_index); } std::vector Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { - return multinomial_sample(logits, num_tokens_per_sequence, rng_engine); + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. + std::vector multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilities + // which result returning NAN only logprobs. 
+ // so log() is applied after this line + auto dist = std::discrete_distribution(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; } std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index df0c406749..7796f93d1e 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -32,27 +32,6 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set log_softmax(const ov::Tensor& logits, size_t batch_idx); -Token greedy_sample(const Logits& logits, size_t top_logprobs); - -std::vector multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine); - -std::pair> -process_stop_strings(const std::set& stop_strings, Tokenizer& tokenizer); - -struct MatchStopStringResult { - size_t to_remove = 0; - // int64_t last_token_id = 0; - // bool is_to_update_last_token = false; - bool is_matched = false; -}; - -MatchStopStringResult match_stop_string(Tokenizer& tokenizer, - const TokenIds& generated_tokens, - const std::pair>& stop_strings, - bool is_include_to_output); - struct SamplerOutput { // IDs of sequences that need to be dropped std::vector m_dropped_sequences; From 24866f38e6eee5b825b48d8ac2080cdd7548b440 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 15:20:42 +0000 Subject: [PATCH 11/20] Add test to check termination by sampler --- .../python_tests/test_llm_pipeline_static.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 10e7255309..5c7f07fcbb 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -183,7 +183,7 @@ def test_unsupported_sampling_raise_error(generation_config): @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_max_number_of_tokens(): +def test_terminate_by_max_number_of_tokens(): model_path = get_models_list()[0][1] prompt = 'The Sun is yellow because' num_tokens = 128 @@ -199,7 +199,7 @@ def test_max_number_of_tokens(): @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.precommit @pytest.mark.nightly -def test_terminate_when_kvcache_is_full(): +def test_terminate_by_out_of_memory(): model_path = get_models_list()[0][1] prompt = 'The Sun is yellow because' pipeline_config = { "MAX_PROMPT_LEN": 64, "MIN_RESPONSE_LEN": 64 } @@ -216,6 +216,29 @@ def test_terminate_when_kvcache_is_full(): assert len(encoded_results.tokens[0]) == (kv_cache_size - input_len + 1) +@pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") +@pytest.mark.precommit +@pytest.mark.nightly +def test_terminate_by_sampler(): + model_path = get_models_list()[0][1] + prompt = 'The Sun is yellow because' + + current_iter = 0 + num_iters = 10 + def callback(subword): + nonlocal current_iter + current_iter += 1 + return current_iter == num_iters + + tokenizer = ov_genai.Tokenizer(model_path) + tokenized_input = tokenizer.encode(prompt) + + pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) + encoded_results = pipe.generate(tokenized_input, max_new_tokens=1000, ignore_eos=True, streamer=callback) + + assert len(encoded_results.tokens[0]) == num_iters + + # FIXME: Known problem, output differs from stateful pipeline starting from 3rd prompt! @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") @pytest.mark.skip(reason="JIRA-144780: Output differs from stateful pipeline") From 99a48be5efdb32c412ead496245d53c226d94236 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov Date: Thu, 2 Jan 2025 15:53:56 +0000 Subject: [PATCH 12/20] Fix comments on review --- src/cpp/src/llm_pipeline_static.cpp | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 6731fb8da0..2d6dbb8cb0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -647,18 +647,6 @@ void stream_generated_tokens(std::shared_ptr streamer_p } } -int64_t get_last_token(ov::genai::SequenceGroup::Ptr sequence_group) { - const auto running_sequences = sequence_group->get_running_sequences(); - OPENVINO_ASSERT(running_sequences.size() == 1u); - const auto sequence = running_sequences.front(); - - size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); - OPENVINO_ASSERT(num_scheduled_tokens == 1u); - - const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); - return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; -} - } // anonymous namespace namespace ov { @@ -1012,6 +1000,10 @@ EncodedResults StaticLLMPipeline::generate( OPENVINO_THROW("Currently only greedy and multinomial decoding are supported"); } + if (config.num_return_sequences != 1u) { + OPENVINO_THROW("Currently only \"num_return_sequences\" equal to 1 is supported!"); + } + ov::genai::EncodedResults results; auto& raw_perf_counters = results.perf_metrics.raw_metrics; // NB: Only batch=1 is supported now @@ -1105,9 +1097,10 @@ EncodedResults StaticLLMPipeline::generate( while (sequence_group->is_running()) { sequence_group->schedule_tokens(1); - int64_t last_token = get_last_token(sequence_group); + const auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); - input_ids_data[0] = last_token; + input_ids_data[0] = running_sequences.front()->get_generated_ids().back(); position_ids_data[0] = m_kvcache_desc.num_stored_tokens; attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u; From 28c37d4e196a220fb9e5ebda6d79aa80d1fd154b Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Thu, 2 Jan 2025 18:33:03 +0000 Subject: [PATCH 13/20] Update tests/python_tests/test_llm_pipeline_static.py Co-authored-by: Ilya Lavrenov --- tests/python_tests/test_llm_pipeline_static.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 
5c7f07fcbb..81ce82793c 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -177,7 +177,7 @@ def test_unsupported_sampling_raise_error(generation_config): prompt = 'What is OpenVINO?' pipe = ov_genai.LLMPipeline(model_path, "NPU", **common_config) with pytest.raises(RuntimeError): - pipe.generate(prompt, **generation_config) + pipe.generate(prompt, generation_config) @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") From 881a565fe826dc7676f420fe4642d5d2c02cdaf3 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Thu, 2 Jan 2025 21:00:28 +0000 Subject: [PATCH 14/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index d6aebdbc3e..ad1a55ef2f 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -198,7 +198,6 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption): - set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): @@ -226,6 +225,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.info(out_str) gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens + gen_config.rng_seed= args["seed"] gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): config_info = "Speculative decoding config: " @@ -352,7 +352,6 @@ def token_printer(): def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, tokens_len, streaming, model_precision, proc_id, mem_consumption): - set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: for bs_index, in_text in enumerate(input_text_list): @@ -378,6 +377,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() gen_config = model.get_generation_config() + gen_config.rng_seed= args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): From 831bf86d02d392237a3b4452470644bafa1e7da8 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 08:50:44 +0000 Subject: [PATCH 15/20] Update test_llm_pipeline_static.py --- .../python_tests/test_llm_pipeline_static.py | 32 +------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index 81ce82793c..bb73033e6a 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -11,26 +11,11 @@ ) from common import \ - get_greedy, \ - get_greedy_with_min_and_max_tokens, \ - get_greedy_with_repetition_penalty, \ - get_greedy_with_penalties, \ - get_greedy_with_min_and_max_tokens, \ - get_greedy_with_single_stop_string, \ - 
get_greedy_with_multiple_stop_strings, \ - get_greedy_with_multiple_stop_strings_no_match, \ - get_greedy_stop_strings_exclude_from_output, \ - get_greedy_stop_strings_include_to_output, \ get_greedy_n_stop_strings_exclude_from_output, \ get_greedy_n_stop_strings_include_to_output, \ get_multinomial_temperature, \ - get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_and_top_k, \ - get_multinomial_temperature_top_p_and_top_k, \ - get_multinomial_temperature_and_repetition_penalty, \ - get_multinomial_temperature_and_frequence_penalty, \ - get_multinomial_temperature_and_presence_penalty, \ get_multinomial_all_parameters, \ + get_multinomial_temperature_and_presence_penalty \ get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. @@ -53,15 +38,6 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ get_greedy(), - get_greedy_with_min_and_max_tokens(), - get_greedy_with_repetition_penalty(), - get_greedy_with_penalties(), - get_greedy_with_min_and_max_tokens(), - get_greedy_with_single_stop_string(), - get_greedy_with_multiple_stop_strings(), - get_greedy_with_multiple_stop_strings_no_match(), - get_greedy_stop_strings_exclude_from_output(), - get_greedy_stop_strings_include_to_output(), get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ] @@ -86,12 +62,6 @@ def test_generation_compare_with_stateful(generation_config): generation_configs = [ - get_multinomial_temperature(), - get_multinomial_temperature_and_top_p(), - get_multinomial_temperature_and_top_k(), - get_multinomial_temperature_top_p_and_top_k(), - get_multinomial_temperature_and_repetition_penalty(), - get_multinomial_temperature_and_frequence_penalty(), get_multinomial_temperature_and_presence_penalty() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. 
Segfault on linux CI") From 7d1fd1d19853c64c32b53dcdc5504465be25a06a Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 08:51:29 +0000 Subject: [PATCH 16/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index bb73033e6a..c9ae3ce30f 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -37,7 +37,6 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ - get_greedy(), get_greedy_n_stop_strings_exclude_from_output(), get_greedy_n_stop_strings_include_to_output() ] From dcb075c731f932db7871eb05d309b54ba94c3cbc Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 10:22:01 +0000 Subject: [PATCH 17/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index c9ae3ce30f..b3969fe002 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -15,7 +15,7 @@ get_greedy_n_stop_strings_include_to_output, \ get_multinomial_temperature, \ get_multinomial_all_parameters, \ - get_multinomial_temperature_and_presence_penalty \ + get_multinomial_temperature_and_presence_penalty, \ get_beam_search # This test suite is designed specifically to validate the functionality and robustness of the StaticLLMPipeline on NPUW:CPU. From dcbf89014e99e9d3a7d55b43602a5b4367bb1d37 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 11:01:50 +0000 Subject: [PATCH 18/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index ad1a55ef2f..03fde296b1 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -225,7 +225,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data log.info(out_str) gen_config = model.get_generation_config() gen_config.max_new_tokens = max_gen_tokens - gen_config.rng_seed= args["seed"] + gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): config_info = "Speculative decoding config: " @@ -377,7 +377,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() gen_config = model.get_generation_config() - gen_config.rng_seed= args["seed"] + gen_config.rng_seed = args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] if args.get('draft_model', ''): From 8e89a9b10286b88df87facdfb54063f6fdce357b Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 3 Jan 2025 11:57:24 +0000 Subject: [PATCH 19/20] Update text_generation.py --- tools/llm_bench/task/text_generation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 03fde296b1..c768d427e7 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ 
-227,6 +227,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config.max_new_tokens = max_gen_tokens gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] + gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -380,6 +381,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.rng_seed = args["seed"] gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] + gen_config.do_sample = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get("num_assistant_tokens", None): From 4dcd5e04c01cbdca5c542f1c827d68d422aa674f Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Sat, 4 Jan 2025 09:56:16 +0000 Subject: [PATCH 20/20] Update test_llm_pipeline_static.py --- tests/python_tests/test_llm_pipeline_static.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python_tests/test_llm_pipeline_static.py b/tests/python_tests/test_llm_pipeline_static.py index ca22dab719..d2d3673356 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -15,8 +15,8 @@ from common import get_default_properties from common import \ - get_greedy_n_stop_strings_exclude_from_output, \ - get_greedy_n_stop_strings_include_to_output, \ + get_greedy, \ + get_greedy_with_penalties, \ get_multinomial_temperature, \ get_multinomial_all_parameters, \ get_multinomial_temperature_and_presence_penalty, \ @@ -46,8 +46,8 @@ def generate_chat_history(model_path, device, pipeline_config, questions): generation_configs = [ - get_greedy_n_stop_strings_exclude_from_output(), - get_greedy_n_stop_strings_include_to_output() + get_greedy(), + get_greedy_with_penalties() ] @pytest.mark.precommit @pytest.mark.nightly
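
Taken together, the series above enables greedy and multinomial sampling in the static (NPU) pipeline, restricts num_return_sequences to 1, and wires rng_seed/do_sample through llm_bench and the Python tests. What follows is a minimal, hypothetical usage sketch, not part of any patch: it assumes an LLM already exported under model_path and reuses only the pipeline-config keys and GenerationConfig fields that appear in the patched tests and text_generation.py (MAX_PROMPT_LEN, MIN_RESPONSE_LEN, max_new_tokens, do_sample, rng_seed); temperature is added purely to illustrate the multinomial branch.

import openvino_genai as ov_genai

# Assumption: a directory holding an exported model plus tokenizer, as in the tests.
model_path = "TinyLlama-1.1B-Chat-v1.0"
pipeline_config = {"MAX_PROMPT_LEN": 128, "MIN_RESPONSE_LEN": 64}

pipe = ov_genai.LLMPipeline(model_path, "NPU", **pipeline_config)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 64
config.do_sample = True        # take the multinomial branch; False keeps greedy decoding
config.temperature = 0.8       # illustrative sampling parameter
config.rng_seed = 42           # fixed seed, as llm_bench now sets for reproducibility
# num_return_sequences must stay 1 on this pipeline (PATCH 12/20 throws otherwise).

print(pipe.generate("The Sun is yellow because", config))

With do_sample left at its default of False (the value llm_bench now forces), the same call exercises the greedy path instead.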