diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 6f4f124894..34d118c387 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -3,6 +3,9 @@
 
 #include "llm_pipeline_static.hpp"
 
+#include "logit_processor.hpp"
+#include "sampler.hpp"
+
 #include <fstream>
 #include <regex>
 
@@ -938,6 +941,30 @@ DecodedResults StaticLLMPipeline::generate(
     return decoded_results;
 }
 
+int64_t sample_next_token(const ov::Tensor& logits,
+                          const GenerationConfig& config,
+                          std::mt19937& rng_engine,
+                          LogitProcessor& logit_processor) {
+    const size_t vocab_size = logits.get_shape()[2];
+    const size_t seq_len_size = logits.get_shape()[1];
+    const size_t offset = (seq_len_size - 1) * vocab_size;
+    // NB: Slice out and take probabilities only for the last token
+    Logits logit_vector(logits.data<float>() + offset, vocab_size);
+    logit_processor.apply(logit_vector);
+    int64_t last_token = -1;
+    if (config.is_greedy_decoding()) {
+        last_token = ov::genai::greedy_sample(logit_vector, config.logprobs).m_index;
+    } else if (config.is_multinomial()) {
+        last_token = ov::genai::multinomial_sample(logit_vector, 1u, rng_engine)[0].m_index;
+    } else {
+        // NB: Only greedy and multinomial decoding are supported;
+        // the appropriate check is performed before this point
+        OPENVINO_ASSERT(false);
+    }
+    logit_processor.register_new_generated_token(last_token);
+    return last_token;
+}
+
 EncodedResults StaticLLMPipeline::generate(
     const EncodedInputs& inputs,
     OptionalGenerationConfig generation_config,
@@ -974,10 +1001,16 @@ EncodedResults StaticLLMPipeline::generate(
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
 
-    if (!config.is_greedy_decoding()) {
-        OPENVINO_THROW("Currently only greedy decoding is supported");
+    if (!config.is_greedy_decoding() && !config.is_multinomial()) {
+        OPENVINO_THROW("Currently only greedy and multinomial decoding are supported");
     }
 
+    std::vector<int64_t> input_ids_vec;
+    input_ids_vec.reserve(input_ids.get_size());
+    std::copy_n(input_ids.data<int64_t>(), input_ids.get_size(), std::back_inserter(input_ids_vec));
+    LogitProcessor logit_processor(config, input_ids_vec);
+    m_rng_engine.seed(config.rng_seed);
+
     ov::Shape prompts_shape = input_ids.get_shape();
     const size_t batch_size = prompts_shape[0];
     ov::genai::EncodedResults results;
@@ -1016,7 +1049,9 @@ EncodedResults StaticLLMPipeline::generate(
     // NB: Now there are prompt_len tokens in KV-cache
     m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len);
-    int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0);
+
+    auto last_token = sample_next_token(
+        m_prefill_request.get_tensor("logits"), config, m_rng_engine, logit_processor);
     results.tokens[0].push_back(last_token);
     if (streamer_ptr && streamer_ptr->put(last_token)) {
         return results;
@@ -1070,7 +1105,8 @@ EncodedResults StaticLLMPipeline::generate(
         m_kvcache_request.infer();
         m_kvcache_desc.num_stored_tokens += 1;
 
-        last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
+        last_token = sample_next_token(
+            m_kvcache_request.get_tensor("logits"), config, m_rng_engine, logit_processor);
         results.tokens[0].push_back(last_token);
 
         raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp
index 7acc28c684..664b820346 100644
--- a/src/cpp/src/llm_pipeline_static.hpp
+++ b/src/cpp/src/llm_pipeline_static.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <memory>
+#include <random>
 
 #include "llm_pipeline_base.hpp"
 
@@ -83,6 +84,9 @@ class StaticLLMPipeline final : public LLMPipelineImplBase {
 
     bool m_is_chat_conversation = false;
    ChatHistory m_history;
+
+    // NB: For multinomial sampling
+    std::mt19937 m_rng_engine;
 };
 
 } // namespace genai
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index 9c18dc7721..0e53f1b4ac 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -67,6 +67,71 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     return tokens;
 }
 
+Token greedy_sample(const Logits& logits, size_t top_logprobs) {
+    // For greedy sampling we do not expect sorting or shrinking considered tokens
+    // so we can operate directly on the data buffer
+    size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1
+    std::vector<float> top_values(m, -std::numeric_limits<float>::infinity());
+    std::vector<size_t> top_indexes(m, 0);
+
+    for (size_t i = 0; i < logits.m_size; ++i) {
+        if (logits.m_data[i] > top_values.back()) {
+            top_values.back() = logits.m_data[i];
+            top_indexes.back() = i;
+
+            for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) {
+                std::swap(top_values[j], top_values[j - 1]);
+                std::swap(top_indexes[j], top_indexes[j - 1]);
+            }
+        }
+    }
+
+    size_t max_index = top_indexes.front();
+    float max_value = 0.0;
+
+    if (top_logprobs) {
+        // apply log softmax to max value
+        max_value = top_values.front();
+        float log_sum = std::log(std::accumulate(
+            logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) {
+                return accumulated + std::exp(to_add - max_value);
+            }));
+        max_value = -log_sum;
+    }
+
+    return Token(max_value, max_index);
+}
+
+std::vector<Token> multinomial_sample(const Logits& logits,
+                                      size_t num_tokens_per_sequence,
+                                      std::mt19937& rng_engine) {
+    // If top_p or top_k was applied we use the sorted vector, if not we go with the original buffer.
+    std::vector<float> multinomial_weights;
+    multinomial_weights.reserve(logits.m_size);
+    if (logits.is_vector_initialized())
+        for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob);
+    else
+        multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size);
+
+    // std::discrete_distribution returns corrupted results when applied to log probabilities,
+    // which results in NaN-only logprobs,
+    // so log() is applied after sampling instead
+    auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1
+
+    std::vector<Token> out_tokens;
+    for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) {
+        size_t element_to_pick = dist(rng_engine);
+        if (logits.is_vector_initialized()) {
+            auto logit = logits.m_vector[element_to_pick];
+            logit.m_log_prob = std::log(logit.m_log_prob);
+            out_tokens.push_back(logit);
+        }
+        else
+            out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick);
+    }
+    return out_tokens;
+}
+
 std::vector<int64_t> wrap_tokens(const std::vector<int64_t>& tokens, const std::vector<int64_t>& prefix_tokens, const std::vector<int64_t>& suffix_tokens) {
     std::vector<int64_t> all_tokens = prefix_tokens;
     all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end());
@@ -481,66 +546,11 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to
 }
 
 Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const {
-    // For greedy sampling we do not expect sorting or shrinking considered tokens
-    // so we can operate directly on the data buffer
-    size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1
-    std::vector<float> top_values(m, -std::numeric_limits<float>::infinity());
-    std::vector<size_t> top_indexes(m, 0);
-
-    for (size_t i = 0; i < logits.m_size; ++i) {
-        if (logits.m_data[i] > top_values.back()) {
-            top_values.back() = logits.m_data[i];
-            top_indexes.back() = i;
-
-            for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) {
-                std::swap(top_values[j], top_values[j - 1]);
-                std::swap(top_indexes[j], top_indexes[j - 1]);
-            }
-        }
-    }
-
-    size_t max_index = top_indexes.front();
-    float max_value = 0.0;
-
-    if (top_logprobs) {
-        // apply log softmax to max value
-        max_value = top_values.front();
-        float log_sum = std::log(std::accumulate(
-            logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) {
-                return accumulated + std::exp(to_add - max_value);
-            }));
-        max_value = -log_sum;
-    }
-
-    return Token(max_value, max_index);
+    return greedy_sample(logits, top_logprobs);
 }
 
 std::vector<Token> Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) {
-    // If top_p or top_k was applied we use sorted vector, if not we go with original buffer.
-    std::vector<float> multinomial_weights;
-    multinomial_weights.reserve(logits.m_size);
-    if (logits.is_vector_initialized())
-        for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob);
-    else
-        multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size);
-
-    // std::discrete_distribution returns corrupted results when applied to log probabilities
-    // which result returning NAN only logprobs.
-    // so log() is applied after this line
-    auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1
-
-    std::vector<Token> out_tokens;
-    for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) {
-        size_t element_to_pick = dist(rng_engine);
-        if (logits.is_vector_initialized()) {
-            auto logit = logits.m_vector[element_to_pick];
-            logit.m_log_prob = std::log(logit.m_log_prob);
-            out_tokens.push_back(logit);
-        }
-        else
-            out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick);
-    }
-    return out_tokens;
+    return multinomial_sample(logits, num_tokens_per_sequence, rng_engine);
 }
 
 std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) {
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 981e11560f..12870ec70e 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -32,6 +32,12 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set<int64_t
 
 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx);
 
+Token greedy_sample(const Logits& logits, size_t top_logprobs);
+
+std::vector<Token> multinomial_sample(const Logits& logits,
+                                      size_t num_tokens_per_sequence,
+                                      std::mt19937& rng_engine);
+
 struct SamplerOutput {
     // IDs of sequences that need to be dropped
     std::vector<uint64_t> m_dropped_sequences;
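
Usage note: after this change, the static (NPU) pipeline takes the multinomial path whenever `GenerationConfig::is_multinomial()` is true, i.e. when `do_sample` is set, and `config.rng_seed` seeds `m_rng_engine`, so repeated runs with the same seed reproduce the same samples. A minimal sketch of exercising the new path through the public `ov::genai::LLMPipeline` API (the model path is a placeholder and the sampling values are illustrative only):

```cpp
#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model directory; the "NPU" device selects StaticLLMPipeline.
    ov::genai::LLMPipeline pipe("/path/to/model_dir", "NPU");

    ov::genai::GenerationConfig config;
    config.do_sample = true;       // is_multinomial() becomes true
    config.temperature = 0.8f;     // applied by LogitProcessor before sampling
    config.top_k = 50;
    config.rng_seed = 42;          // seeds m_rng_engine for reproducible runs
    config.max_new_tokens = 64;

    std::string result = pipe.generate("What is OpenVINO?", config);
    std::cout << result << std::endl;
}
```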
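
Side note on the comment in `multinomial_sample`: `std::discrete_distribution` interprets its weights as (unnormalized) probabilities, so it must be constructed from the post-softmax values that the logit transforms produce, not from log probabilities; `log()` is taken only on the picked element, to report its logprob. A standalone sketch of the same pattern (values are illustrative):

```cpp
#include <cmath>
#include <iostream>
#include <random>
#include <vector>

int main() {
    std::mt19937 rng(42);
    // Post-softmax probabilities for a toy 3-token vocabulary.
    std::vector<float> probs = {0.7f, 0.2f, 0.1f};
    // One draw from discrete_distribution == multinomial with one trial.
    std::discrete_distribution<size_t> dist(probs.begin(), probs.end());
    const size_t picked = dist(rng);
    // log() is applied only after sampling, mirroring multinomial_sample() above.
    std::cout << "token " << picked << ", logprob " << std::log(probs[picked]) << '\n';
}
```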