From 44650f42fcd27df7b72fe0124dedf398734e3942 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Fri, 15 Nov 2024 20:07:09 +0000
Subject: [PATCH 1/2] Fix wrong logits processing when slice matmul is not applied

---
 src/cpp/src/lm_encoding.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index 644aa369c6..cb27ca60fb 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -105,6 +105,20 @@ std::pair get_lm_encoded_results(
     auto logits = m_llm.get_tensor("logits");
 
+    // if slice matmul is not applied, logits contain not only the result tokens
+    size_t vocab_size = logits.get_shape().back();
+    if (!m_embedding.has_value()) {
+        ov::Tensor new_logits = ov::Tensor(logits.get_element_type(), {batch_size, 1, vocab_size});
+        size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size;
+
+        for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
+            size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size;
+            const float* logits_data = logits.data<float>() + batch_offset + sequence_offset;
+            std::copy(logits_data, logits_data + vocab_size, new_logits.data<float>() + batch_idx * vocab_size);
+        }
+        logits = new_logits;
+    }
+
     int64_t sequence_len = logits.get_shape().at(1);
     for (auto& sequence_group : sequence_groups)
         sequence_group->schedule_tokens(sequence_len);

From 98d780fdc013fce50943b98743897195fc63b3e7 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Fri, 15 Nov 2024 20:59:28 +0000
Subject: [PATCH 2/2] update

---
 src/cpp/src/llm_pipeline.cpp             |  5 -----
 src/cpp/src/lm_encoding.cpp              | 19 ++++---------------
 src/cpp/src/visual_language/pipeline.cpp |  1 -
 3 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 830fa7ac37..62a72b1cbd 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -269,18 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         SequenceGroup::Ptr sequence_group;
         if (is_chat_conversation && !m_is_cache_empty) {
             sequence_group = std::make_shared<SequenceGroup>(request_id, m_tokenized_chat_history.input_ids, config, block_size, enable_prefix_caching);
-            sequence_group->update_processed_tokens_num(m_tokenized_chat_history.input_ids.get_shape().at(1) - 1);
         } else {
             size_t seq_len = input_ids.get_shape().at(1);
             size_t batch_offset = request_id * seq_len;
             const int64_t* prompt_start = input_ids.data<int64_t>() + batch_offset;
             std::vector<int64_t> tokenized_prompt(prompt_start, prompt_start + seq_len);
-            // in case of multi batch scenario, remove eos_token_id at start of prompt
-            auto real_prompt_start = std::find_if(tokenized_prompt.begin(), tokenized_prompt.end(), [&config](int64_t token) { return token != config.eos_token_id; });
-            tokenized_prompt.erase(tokenized_prompt.begin(), real_prompt_start);
             sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_prompt, config, block_size, enable_prefix_caching);
-            sequence_group->update_processed_tokens_num(tokenized_prompt.size() - 1);
         }
 
         sequence_group->set_sequence_group_ptr(sequence_group);
diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp
index cb27ca60fb..c76d9f7edf 100644
--- a/src/cpp/src/lm_encoding.cpp
+++ b/src/cpp/src/lm_encoding.cpp
@@ -105,24 +105,13 @@ std::pair get_lm_encoded_results(
     auto logits = m_llm.get_tensor("logits");
 
-    // if slice matmul is not applied, logits contain not only the result tokens
-    size_t vocab_size = logits.get_shape().back();
-    if (!m_embedding.has_value()) {
-        ov::Tensor new_logits = ov::Tensor(logits.get_element_type(), {batch_size, 1, vocab_size});
-        size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size;
-
-        for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
-            size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size;
-            const float* logits_data = logits.data<float>() + batch_offset + sequence_offset;
-            std::copy(logits_data, logits_data + vocab_size, new_logits.data<float>() + batch_idx * vocab_size);
-        }
-        logits = new_logits;
-    }
-
     int64_t sequence_len = logits.get_shape().at(1);
-    for (auto& sequence_group : sequence_groups)
+    for (auto& sequence_group : sequence_groups) {
+        sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - sequence_len);
         sequence_group->schedule_tokens(sequence_len);
+    }
+
     std::map<size_t, size_t> beam_offets;
     for (size_t i = 0; i < sequence_groups.size(); i++)
         beam_offets.insert({sequence_groups.at(i)->get_request_id(), i});
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 92358f5810..9ece0ff754 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -105,7 +105,6 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
-        sequence_group->update_processed_tokens_num(history_size);
         sequence_group->set_sequence_group_ptr(sequence_group);
         requests.push_back(sequence_group);
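For reference, below is a minimal standalone sketch of the last-token slicing that PATCH 1 performs inline: when the slice-matmul transformation has not already reduced the model output, the "logits" tensor has shape [batch_size, seq_len, vocab_size] and only the last position of each sequence is needed for sampling. The helper name slice_last_token_logits is illustrative rather than an openvino.genai API, and f32 logits are assumed.

#include <openvino/runtime/tensor.hpp>

#include <algorithm>
#include <cstddef>

// Illustrative helper (not part of openvino.genai): copy only the last
// position of each sequence from a [batch_size, seq_len, vocab_size] logits
// tensor into a [batch_size, 1, vocab_size] tensor, assuming f32 elements.
static ov::Tensor slice_last_token_logits(const ov::Tensor& logits) {
    const ov::Shape shape = logits.get_shape();  // {batch_size, seq_len, vocab_size}
    const size_t batch_size = shape.at(0);
    const size_t seq_len = shape.at(1);
    const size_t vocab_size = shape.at(2);

    ov::Tensor last_logits(logits.get_element_type(), {batch_size, 1, vocab_size});
    const size_t sequence_offset = (seq_len - 1) * vocab_size;  // start of the last position

    for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
        const size_t batch_offset = batch_idx * seq_len * vocab_size;
        const float* src = logits.data<float>() + batch_offset + sequence_offset;
        std::copy(src, src + vocab_size, last_logits.data<float>() + batch_idx * vocab_size);
    }
    return last_logits;
}

PATCH 2 then drops this copy entirely: instead of trimming the logits, each SequenceGroup records how many of its prompt tokens are already processed (update_processed_tokens_num(get_prompt_len() - sequence_len)), so the sampler can pick the correct positions from the full logits tensor itself.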