From 08154facb8bf64fac849301d52637d0de9c878f3 Mon Sep 17 00:00:00 2001 From: Oleg Pipikin Date: Thu, 4 Jul 2024 08:47:04 +0200 Subject: [PATCH 1/2] Fix chat template, add test for chat scenario (#473) Fix chat template, add test for chat scenario --- .github/workflows/causal_lm_cpp.yml | 60 +++++++++++++++++++++++++++++ src/cpp/src/llm_pipeline.cpp | 40 +++++++++++++++++-- 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 69ad8a56cb..f7cb11a8b8 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -524,3 +524,63 @@ jobs: && export PYTHONPATH=./build/:$PYTHONPATH && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - + + cpp-chat_sample-ubuntu: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Download, convert and build + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Compare + run: | + source ./ov/setupvars.sh + printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\nStop!\n' > ./input.txt + timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + python -c " + from transformers import LlamaTokenizer, AutoModelForCausalLM + model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' + tokenizer = LlamaTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?'] + def gen_prompt(prompt): + return {'role': 'user', 'content': prompt} + def gen_answer(answer): + return {'role': 'assistant', 'content': answer} + chat_history = [] + chat_prompt = '' + output = open('ref.txt', 'w') + for prompt in prompts: + output.write('question:\n') + chat_history.append(gen_prompt(prompt)) + chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt') + answer = model.generate(**tokenized, max_length=1000, do_sample=False) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history.append(gen_answer(answer_str)) + output.write(answer_str) + output.write('\n----------\n') + output.write('question:\n') + output.close() + " + diff pred.txt ref.txt + echo "Chat sample cpp" passed + export PYTHONPATH=./build/:$PYTHONPATH + timeout 30s ./samples/python/chat_sample/chat_sample.py 
./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + diff pred2.txt ref.txt + echo "Chat sample python" passed diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 764a17560a..d2eb9f4a66 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -14,6 +14,24 @@ #include "utils.hpp" #include "text_callback_streamer.hpp" +namespace { + +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){ + auto first_size = fisrt.input_ids.get_size(); + auto second_size = second.input_ids.get_size(); + ov::Shape new_shape{1, first_size - second_size}; + + ov::Tensor new_input_ids(ov::element::i64, new_shape); + auto data_ptr = fisrt.input_ids.data(); + std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data()); + + ov::Tensor new_attention_mask(ov::element::i64, new_shape); + std::fill_n(new_attention_mask.data(), new_shape[1], 1); + + return {new_input_ids, new_attention_mask}; +} +} + namespace ov { namespace genai { @@ -98,15 +116,29 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { std::string& prompt = *input_prompt; if (is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
+ m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - - prompt = new_templated_chat_history.substr(m_templated_chat_history.size()); + auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history); + if (m_is_cache_empty) { + encoded_input = new_chat_tokens; + } else { + auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history); + encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens); + } m_templated_chat_history = new_templated_chat_history; + } else { + encoded_input = m_tokenizer.encode(prompt); } - - encoded_input = m_tokenizer.encode(prompt); } auto encoded_results = generate(encoded_input, config, streamer); From 6667c3dec45088a3851a9b394fcb842165620bfe Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Fri, 5 Jul 2024 09:46:13 +0400 Subject: [PATCH 2/2] Reuse GenerationConfig (#569) --- .../cpp/accuracy_sample/accuracy_sample.cpp | 10 +- .../throughput_benchmark.cpp | 6 +- src/cpp/continuous_batching/CMakeLists.txt | 1 - .../include/continuous_batching_pipeline.hpp | 8 +- .../include/generation_config.hpp | 78 ------------- .../include/generation_handle.hpp | 6 +- .../src/continuous_batching_pipeline.cpp | 14 +-- .../src/generation_config.cpp | 105 ------------------ .../src/logit_processor.hpp | 10 +- src/cpp/continuous_batching/src/sampler.hpp | 58 ++++++---- .../src/sequence_group.hpp | 16 +-- .../src/tests/block_manager.cpp | 2 +- .../src/tests/generate_config.cpp | 34 +++--- .../src/tests/scheduler.cpp | 21 ++-- .../openvino/genai/generation_config.hpp | 22 +++- src/cpp/src/generation_config.cpp | 54 ++++++++- src/cpp/src/llm_pipeline.cpp | 2 +- src/cpp/src/llm_pipeline_static.cpp | 2 +- src/python/py_generate_pipeline.cpp | 10 +- src/python/python.cpp | 30 ----- .../continuous_batching/common.py | 31 +++--- .../continuous_batching/test_sampling.py | 12 +- 22 files changed, 209 insertions(+), 323 deletions(-) delete mode 100644 src/cpp/continuous_batching/include/generation_config.hpp delete mode 100644 src/cpp/continuous_batching/src/generation_config.cpp diff --git a/samples/cpp/accuracy_sample/accuracy_sample.cpp b/samples/cpp/accuracy_sample/accuracy_sample.cpp index 5dbfc70844..2545621d4e 100644 --- a/samples/cpp/accuracy_sample/accuracy_sample.cpp +++ b/samples/cpp/accuracy_sample/accuracy_sample.cpp @@ -51,14 +51,14 @@ int main(int argc, char* argv[]) try { "What is OpenVINO?", }; - std::vector sampling_params_examples { - GenerationConfig::beam_search(), - GenerationConfig::greedy(), - GenerationConfig::multinomial(), + std::vector sampling_params_examples { + ov::genai::beam_search(), + ov::genai::greedy(), + ov::genai::multinomial(), }; std::vector prompts(num_prompts); - std::vector sampling_params(num_prompts); + std::vector sampling_params(num_prompts); for (size_t request_id = 0; request_id < num_prompts; ++request_id) { prompts[request_id] = prompt_examples[request_id % prompt_examples.size()]; diff --git a/samples/cpp/throughput_benchmark/throughput_benchmark.cpp b/samples/cpp/throughput_benchmark/throughput_benchmark.cpp index 09ee08934b..4e47d96a96 100644 --- a/samples/cpp/throughput_benchmark/throughput_benchmark.cpp +++ b/samples/cpp/throughput_benchmark/throughput_benchmark.cpp @@ -37,7 +37,7 @@ class AutoStartTimer { struct Dataset { std::vector m_prompts; - std::vector m_sampling_params; + std::vector m_sampling_params; std::vector m_input_lens, 
m_output_lens; size_t m_total_input_len = 0; @@ -50,7 +50,7 @@ struct Dataset { m_output_lens.reserve(size); } - void push_data(std::string prompt, GenerationConfig sampling_params) { + void push_data(std::string prompt, ov::genai::GenerationConfig sampling_params) { m_prompts.push_back(prompt); m_sampling_params.push_back(sampling_params); } @@ -121,7 +121,7 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data if (input_len > max_input_len || (input_len + output_len) > 2048) continue; - GenerationConfig greedy_search = GenerationConfig::greedy(); + ov::genai::GenerationConfig greedy_search = ov::genai::greedy(); greedy_search.max_new_tokens = std::min(max_output_len, output_len); dataset.push_data(human_question, greedy_search); diff --git a/src/cpp/continuous_batching/CMakeLists.txt b/src/cpp/continuous_batching/CMakeLists.txt index 41e49da143..7e5ff5c611 100644 --- a/src/cpp/continuous_batching/CMakeLists.txt +++ b/src/cpp/continuous_batching/CMakeLists.txt @@ -28,7 +28,6 @@ find_file(spda_to_pa_header sdpa_to_paged_attention.hpp set(TARGET_NAME openvino_continuous_batching) add_library(${TARGET_NAME} STATIC - src/generation_config.cpp src/generation_handle.cpp src/continuous_batching_pipeline.cpp src/paged_attention_transformations.cpp) diff --git a/src/cpp/continuous_batching/include/continuous_batching_pipeline.hpp b/src/cpp/continuous_batching/include/continuous_batching_pipeline.hpp index 58cf0fdf7e..e03d2fbf0f 100644 --- a/src/cpp/continuous_batching/include/continuous_batching_pipeline.hpp +++ b/src/cpp/continuous_batching/include/continuous_batching_pipeline.hpp @@ -8,7 +8,7 @@ #include "scheduler_config.hpp" #include "openvino/genai/tokenizer.hpp" -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" #include "generation_handle.hpp" struct PipelineMetrics { @@ -32,16 +32,16 @@ class ContinuousBatchingPipeline { std::shared_ptr get_tokenizer(); - GenerationConfig get_config() const; + ov::genai::GenerationConfig get_config() const; PipelineMetrics get_metrics() const; - GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params); + GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params); void step(); bool has_non_finished_requests(); // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& prompts, std::vector sampling_params); + std::vector generate(const std::vector& prompts, std::vector sampling_params); }; diff --git a/src/cpp/continuous_batching/include/generation_config.hpp b/src/cpp/continuous_batching/include/generation_config.hpp deleted file mode 100644 index e53cce86a7..0000000000 --- a/src/cpp/continuous_batching/include/generation_config.hpp +++ /dev/null @@ -1,78 +0,0 @@ - -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include - -enum class StopCriteria { - EARLY, - HEURISTIC, - NEVER -}; - -// TODO: implement better interface, because currently sequence is not available to public API -class Sequence; - -struct GenerationConfig { - // Generic - size_t max_new_tokens = std::numeric_limits::max(); - size_t min_new_tokens = 0; - size_t max_length = std::numeric_limits::max(); // m_max_new_tokens should have priority over m_max_length - bool ignore_eos = false; - - // Beam search specific - size_t num_groups = 1; - size_t group_size = 1; // 
beam_width - float diversity_penalty = 1.0f; // 0.0 means no diversity - StopCriteria stop_criteria = StopCriteria::HEURISTIC; - size_t num_return_sequences = 3; // is used by beam search, in other case is equal to batch size - - float repetition_penalty = 1.0f; // based on token repetition in prompt and generated tests - float presence_penalty = 0.0f; // based on token repetition and generated tests - float frequence_penalty = 0.0f; // based on quantity token repetition and generated tests - float length_penalty = 1.0f; - size_t no_repeat_ngram_size = std::numeric_limits::max(); - std::function early_finish = [] (const Sequence&) { return false; }; - - // Multinomial - float temperature = 0.0f; // by default we use greedy sampling - int top_k = 0; // HF transformers uses a value of 0 or `None` to disable top-K logit warping - float top_p = 1.0f; // by default convsider all tokens - bool do_sample = false; - size_t rng_seed = 0; - - // special tokens IDs - int64_t bos_token_id = -1; - int64_t pad_token_id = -1; - int64_t eos_token_id = -1; - - // reads generation config from HF generation_config.json - static GenerationConfig from_file(const std::string& generation_config_json); - - static GenerationConfig greedy(); - - static GenerationConfig beam_search(); - - static GenerationConfig multinomial(); - - bool is_greedy_sampling() const { - return temperature == 0.0f && !is_beam_search(); - } - - bool is_beam_search() const { - return num_groups * group_size > 1; - } - - bool is_multinomial() const { - return do_sample; - } - - void set_eos_token_id(size_t tokenizer_eos_token_id); - - void validate() const; -}; diff --git a/src/cpp/continuous_batching/include/generation_handle.hpp b/src/cpp/continuous_batching/include/generation_handle.hpp index 63d40ca935..07091a70c2 100644 --- a/src/cpp/continuous_batching/include/generation_handle.hpp +++ b/src/cpp/continuous_batching/include/generation_handle.hpp @@ -6,7 +6,7 @@ #include #include -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" enum class GenerationStatus { @@ -42,10 +42,10 @@ class GenerationStream; class GenerationHandleImpl { std::shared_ptr m_generation_stream; - GenerationConfig m_sampling_params; + ov::genai::GenerationConfig m_sampling_params; public: - GenerationHandleImpl(std::shared_ptr generation_stream, const GenerationConfig& sampling_params) : + GenerationHandleImpl(std::shared_ptr generation_stream, const ov::genai::GenerationConfig& sampling_params) : m_generation_stream(generation_stream), m_sampling_params(sampling_params) {}; diff --git a/src/cpp/continuous_batching/src/continuous_batching_pipeline.cpp b/src/cpp/continuous_batching/src/continuous_batching_pipeline.cpp index 9f2c8135ec..175e4cb2df 100644 --- a/src/cpp/continuous_batching/src/continuous_batching_pipeline.cpp +++ b/src/cpp/continuous_batching/src/continuous_batching_pipeline.cpp @@ -26,7 +26,7 @@ class ContinuousBatchingPipeline::Impl { // TODO (mzegla): GenerationConfig is request specific object // and pipeline only uses default rng_seed. 
- GenerationConfig m_generation_config; + ov::genai::GenerationConfig m_generation_config; PipelineMetrics m_pipeline_metrics; @@ -103,7 +103,7 @@ class ContinuousBatchingPipeline::Impl { // read default generation config } - GenerationConfig get_config() const { + ov::genai::GenerationConfig get_config() const { return m_generation_config; } @@ -115,7 +115,7 @@ class ContinuousBatchingPipeline::Impl { return m_tokenizer; } - GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) { + GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id()); sampling_params.validate(); @@ -233,7 +233,7 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector prompts, std::vector sampling_params) { + std::vector generate(const std::vector prompts, std::vector sampling_params) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); OPENVINO_ASSERT(prompts.size() == sampling_params.size()); @@ -285,7 +285,7 @@ std::shared_ptr ContinuousBatchingPipeline::get_tokenizer( return m_impl->get_tokenizer(); } -GenerationConfig ContinuousBatchingPipeline::get_config() const{ +ov::genai::GenerationConfig ContinuousBatchingPipeline::get_config() const{ return m_impl->get_config(); } @@ -293,7 +293,7 @@ PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ return m_impl->get_metrics(); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) { +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) { return m_impl->add_request(request_id, prompt, sampling_params); } @@ -305,6 +305,6 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { return m_impl->generate(prompts, sampling_params); } \ No newline at end of file diff --git a/src/cpp/continuous_batching/src/generation_config.cpp b/src/cpp/continuous_batching/src/generation_config.cpp deleted file mode 100644 index 54e3f045f6..0000000000 --- a/src/cpp/continuous_batching/src/generation_config.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "nlohmann/json.hpp" - -#include "generation_config.hpp" - -#include "openvino/core/except.hpp" - -void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { - if (eos_token_id < 0) { - eos_token_id = tokenizer_eos_token_id; - } else { - OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, - "EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (", - tokenizer_eos_token_id, ")"); - } -} - -void GenerationConfig::validate() const { - OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); - OPENVINO_ASSERT(min_new_tokens >= 0, "min_new_tokens must be greater 0"); - OPENVINO_ASSERT(max_new_tokens >= 0, "max_new_tokens 
must be greater 0"); - if (is_beam_search()) { - OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); - } else { - OPENVINO_ASSERT(repetition_penalty >= 0.0f, "repetition penalty must be a positive value"); - OPENVINO_ASSERT(frequence_penalty >= -2.0f && frequence_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); - OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); - if (is_multinomial()) { - OPENVINO_ASSERT(top_p > 0.0f && top_p <= 1.0f, "top_p must be in the interval (0, 1]"); - OPENVINO_ASSERT(temperature >= 0.0f, "temperature must be a positive value"); - } - } -} - -GenerationConfig GenerationConfig::from_file(const std::string& generation_config_json) { - std::ifstream f(generation_config_json); - nlohmann::json json_data = nlohmann::json::parse(f); - - GenerationConfig config; - - config.bos_token_id = json_data.value("bos_token_id", -1); - config.eos_token_id = json_data.value("eos_token_id", -1); - config.pad_token_id = json_data.value("pad_token_id", -1); - - config.num_return_sequences = json_data.value("num_return_sequences", 1); - - config.max_new_tokens = json_data.value("max_new_tokens", std::numeric_limits::max()); - config.min_new_tokens = json_data.value("min_new_tokens", 0); - config.max_length = json_data.value("max_length", std::numeric_limits::max()); - - config.temperature = json_data.value("temperature", 0.0f); - config.do_sample = json_data.value("do_sample", false); - config.top_p = json_data.value("top_p", 0.0f); - - // beam_search_params - config.num_groups = json_data.value("num_beam_groups", 1); - config.diversity_penalty = json_data.value("diversity_penalty", 1.0f); - config.repetition_penalty = json_data.value("repetition_penalty", 1.0f); - config.frequence_penalty = json_data.value("frequence_penalty", 0.0f); - config.presence_penalty = json_data.value("presence_penalty", 0.0f); - const int num_beams = json_data.value("num_beams", 1); - config.group_size = num_beams / config.num_groups; - - return config; -} - -GenerationConfig GenerationConfig::greedy() { - GenerationConfig greedy_params; - greedy_params.temperature = 0.0f; - greedy_params.ignore_eos = true; - greedy_params.num_return_sequences = 1; - greedy_params.repetition_penalty = 3.0f; - greedy_params.presence_penalty = 0.1f; - greedy_params.frequence_penalty = 0.01f; - greedy_params.max_new_tokens = 30; - return greedy_params; -} - -GenerationConfig GenerationConfig::beam_search() { - GenerationConfig beam_search; - beam_search.num_groups = 2; - beam_search.num_return_sequences = 3; - beam_search.group_size = 2; - beam_search.max_new_tokens = 100; - beam_search.diversity_penalty = 2.0f; - return beam_search; -} - -GenerationConfig GenerationConfig::multinomial() { - GenerationConfig multinomial; - multinomial.do_sample = true; - multinomial.temperature = 0.9f; - multinomial.top_p = 0.9f; - multinomial.top_k = 20; - multinomial.num_return_sequences = 3; - multinomial.presence_penalty = 0.01f; - multinomial.frequence_penalty = 0.1f; - multinomial.min_new_tokens = 15; - multinomial.max_new_tokens = 30; - return multinomial; -} diff --git a/src/cpp/continuous_batching/src/logit_processor.hpp b/src/cpp/continuous_batching/src/logit_processor.hpp index ab151e55aa..048e97ea49 100644 --- a/src/cpp/continuous_batching/src/logit_processor.hpp +++ b/src/cpp/continuous_batching/src/logit_processor.hpp @@ -6,7 +6,7 @@ #include #include -#include "generation_config.hpp" +#include 
"openvino/genai/generation_config.hpp" struct Token { float m_log_prob = 0.; @@ -277,7 +277,7 @@ class LogitProcessor { size_t m_generated_tokens = 0; public: - LogitProcessor(const GenerationConfig& sampling_params, + LogitProcessor(const ov::genai::GenerationConfig& sampling_params, const LogitTransformers::TokenIds& input_ids) { for (const auto& input_id : input_ids) { m_unique_prompt_token_ids->insert(input_id); @@ -289,7 +289,7 @@ class LogitProcessor { ); } - if (sampling_params.is_multinomial() || sampling_params.is_greedy_sampling()) { + if (sampling_params.is_multinomial() || sampling_params.is_greedy_decoding()) { if (sampling_params.repetition_penalty != 1.0f) { std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::RepetitionPenaltyTransform(sampling_params.repetition_penalty)); @@ -304,9 +304,9 @@ class LogitProcessor { m_logit_transformers.push_back(transformer); } - if (sampling_params.frequence_penalty != 0.0f) { + if (sampling_params.frequency_penalty != 0.0f) { std::shared_ptr transformer = - std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequence_penalty)); + std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequency_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); } diff --git a/src/cpp/continuous_batching/src/sampler.hpp b/src/cpp/continuous_batching/src/sampler.hpp index 322c447435..6672825b15 100644 --- a/src/cpp/continuous_batching/src/sampler.hpp +++ b/src/cpp/continuous_batching/src/sampler.hpp @@ -110,14 +110,17 @@ struct Group { std::vector min_heap; // The worst of the best completed beams is the first bool done = false; - int64_t finish(Beam beam, const GenerationConfig& sampling_params) { + int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) { int64_t preeempted_sequence_id = -1; float generated_len = beam.get_generated_len() + (beam.m_token_id == sampling_params.eos_token_id ? 
1 : 0); // HF counts EOS token in generation length beam.m_score /= std::pow(generated_len, sampling_params.length_penalty); min_heap.push_back(beam); std::push_heap(min_heap.begin(), min_heap.end(), greater); - if (min_heap.size() > sampling_params.group_size) { + OPENVINO_ASSERT(sampling_params.num_beams % sampling_params.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() > group_size) { std::pop_heap(min_heap.begin(), min_heap.end(), greater); preeempted_sequence_id = min_heap.back().m_sequence->get_id(); min_heap.pop_back(); @@ -126,8 +129,11 @@ struct Group { return preeempted_sequence_id; } - void is_done(const GenerationConfig& sampling_params) { - if (min_heap.size() < sampling_params.group_size) + void is_done(const ov::genai::GenerationConfig& sampling_params) { + OPENVINO_ASSERT(sampling_params.num_beams % sampling_params.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() < group_size) return; const Beam& best_running_sequence = ongoing.front(), & worst_finished_sequence = min_heap.front(); @@ -135,15 +141,15 @@ struct Group { float best_sum_logprobs = best_running_sequence.m_score; float worst_score = worst_finished_sequence.m_score; switch (sampling_params.stop_criteria) { - case StopCriteria::EARLY: + case ov::genai::StopCriteria::EARLY: done = true; return; - case StopCriteria::HEURISTIC: { + case ov::genai::StopCriteria::HEURISTIC: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; } - case StopCriteria::NEVER: { + case ov::genai::StopCriteria::NEVER: { size_t length = sampling_params.length_penalty > 0.0 ? 
sampling_params.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; @@ -165,7 +171,7 @@ struct SamplerOutput { class GroupBeamSearcher { SequenceGroup::Ptr m_sequence_group; - GenerationConfig m_parameters; + ov::genai::GenerationConfig m_parameters; std::vector m_groups; public: explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group); @@ -258,7 +264,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, size_t num_running_sequences = sequence_group->num_running_seqs(); size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); - const GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); + const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); const auto request_id = sequence_group->get_request_id(); if (!m_logit_processors.count(request_id)) { @@ -270,9 +276,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); if (sequence_group->requires_sampling()) { - if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { + if (sampling_params.is_greedy_decoding() || sampling_params.is_multinomial()) { std::vector running_sequences = sequence_group->get_running_sequences(); - if (sampling_params.is_greedy_sampling()) { + if (sampling_params.is_greedy_decoding()) { OPENVINO_ASSERT(num_running_sequences == 1); } auto register_new_token = [&](const Token& sampled_token_id, Sequence::Ptr running_sequence) { @@ -284,7 +290,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, logit_vector = logit_processor.apply(logit_vector); Token sampled_token_id; - if (sampling_params.is_greedy_sampling()) { + if (sampling_params.is_greedy_decoding()) { sampled_token_id = _greedy_sample(logit_vector); } else { // is_multinomial() @@ -360,13 +366,16 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group) : m_sequence_group(sequence_group), m_parameters{m_sequence_group->get_sampling_parameters()}, - m_groups{m_parameters.num_groups} { + m_groups{m_parameters.num_beam_groups} { OPENVINO_ASSERT(m_sequence_group->num_running_seqs() == 1); + OPENVINO_ASSERT(m_parameters.num_beams % m_parameters.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; for (Group& group : m_groups) { - group.ongoing.reserve(m_parameters.group_size); + group.ongoing.reserve(group_size); // initially we just add our "base" sequence to beams inside each group - for (size_t i = 0; i < m_parameters.group_size; ++i) + for (size_t i = 0; i < group_size; ++i) group.ongoing.push_back(Beam((*sequence_group)[0])); // to avoid selecting the same tokens for beams within group, let's just initialize score // for the front one @@ -375,10 +384,13 @@ GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group) } void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) { + OPENVINO_ASSERT(m_parameters.num_beams % m_parameters.num_beam_groups == 0, + "number of beams should be 
divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; std::vector next_tokens; std::vector next_beams; - next_tokens.reserve(m_parameters.num_groups * m_parameters.group_size); - next_beams.reserve(m_parameters.num_groups * m_parameters.group_size); + next_tokens.reserve(m_parameters.num_beams); + next_beams.reserve(m_parameters.num_beams); // parent sequence ID -> number of child sequences std::map parent_2_num_childs_map; @@ -447,7 +459,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp continue; std::vector candidates; - candidates.reserve(m_parameters.group_size * 2 * m_parameters.group_size); + candidates.reserve(group_size * 2 * group_size); for (const Beam& beam : group.ongoing) { std::vector tokens = log_softmax(logits, beam.m_global_beam_idx); @@ -486,7 +498,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp try_to_finish_candidate(group, new_candidate); } else { candidates.push_back(new_candidate); - if (++add_count == 2 * m_parameters.group_size) { + if (++add_count == 2 * group_size) { break; } } @@ -494,16 +506,16 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp } // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam - OPENVINO_ASSERT(candidates.size() >= 2 * m_parameters.group_size, "No beams left to search"); + OPENVINO_ASSERT(candidates.size() >= 2 * group_size, "No beams left to search"); - auto to_sort = candidates.begin() + ptrdiff_t(2 * m_parameters.group_size); + auto to_sort = candidates.begin() + ptrdiff_t(2 * group_size); std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { Beam & candidate = candidates[cand_idx]; if (m_parameters.eos_token_id == candidate.m_token_id) { // If beam_token does not belong to top num_beams tokens, it should not be added - if (cand_idx >= m_parameters.group_size) + if (cand_idx >= group_size) continue; // try to finish candidate @@ -513,7 +525,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp child_beams_per_group[group_id].push_back(candidate); // if num childs are enough - if (child_beams_per_group[group_id].size() == m_parameters.group_size) { + if (child_beams_per_group[group_id].size() == group_size) { break; } } diff --git a/src/cpp/continuous_batching/src/sequence_group.hpp b/src/cpp/continuous_batching/src/sequence_group.hpp index b21ca273a0..4897789f6f 100644 --- a/src/cpp/continuous_batching/src/sequence_group.hpp +++ b/src/cpp/continuous_batching/src/sequence_group.hpp @@ -8,7 +8,7 @@ #include #include "generation_handle.hpp" -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" #include "generation_stream.hpp" enum class SequenceStatus { @@ -115,7 +115,7 @@ class Sequence { return m_cumulative_log_prob; } - float get_beam_search_score(const GenerationConfig& sampling_params) const { + float get_beam_search_score(const ov::genai::GenerationConfig& sampling_params) const { float cumulative_log_prob = get_cumulative_log_probs(), current_length = get_generated_len(); float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); return score; @@ -129,7 +129,7 @@ class Sequence { class SequenceGroup { uint64_t m_request_id; std::vector m_sequences; - GenerationConfig m_sampling_params; + ov::genai::GenerationConfig m_sampling_params; std::size_t 
m_block_size; TokenIds m_prompt_ids; GenerationStream::Ptr m_generation_stream; @@ -146,7 +146,7 @@ class SequenceGroup { // context length of longest sequence within a group size_t m_max_content_len = 0; - SequenceGroup(uint64_t request_id, const GenerationConfig& sampling_params, std::size_t block_size) + SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : m_request_id(request_id), m_sampling_params(sampling_params), m_block_size(block_size) { @@ -156,11 +156,11 @@ class SequenceGroup { using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; - SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const GenerationConfig& sampling_params, std::size_t block_size) + SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : SequenceGroup(request_id, ov::Tensor(ov::element::i64, ov::Shape{input_ids.size()}, (void *)input_ids.data()), sampling_params, block_size) { } - SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const GenerationConfig& sampling_params, std::size_t block_size) + SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : SequenceGroup(request_id, sampling_params, block_size) { add_sequence(Sequence::create(m_next_sequence_id++)); @@ -363,7 +363,7 @@ class SequenceGroup { return m_sequences.back(); } - const GenerationConfig& get_sampling_parameters() const { + const ov::genai::GenerationConfig& get_sampling_parameters() const { return m_sampling_params; } @@ -459,7 +459,7 @@ class SequenceGroup { } } // For greedy or multinomial sampling we decide whever to stream partial results depending on the user parameter - } else if (m_sampling_params.is_greedy_sampling() || m_sampling_params.is_multinomial()) { + } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) { // TO DO: Now we always stream for greedy search for the sake of benchmarking if (num_total_seqs() == 1 /* m_sampling_params.stream */) { // TODO: support streamimg for n seqs diff --git a/src/cpp/continuous_batching/src/tests/block_manager.cpp b/src/cpp/continuous_batching/src/tests/block_manager.cpp index 79762318c9..6927a98164 100644 --- a/src/cpp/continuous_batching/src/tests/block_manager.cpp +++ b/src/cpp/continuous_batching/src/tests/block_manager.cpp @@ -7,7 +7,7 @@ #include "continuous_batching_pipeline.hpp" #include "sequence_group.hpp" #include "scheduler.hpp" -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" TEST(TestBlockManager, general_test) { BlockManager bm = BlockManager(6); diff --git a/src/cpp/continuous_batching/src/tests/generate_config.cpp b/src/cpp/continuous_batching/src/tests/generate_config.cpp index 1774553313..3bd53a4ca6 100644 --- a/src/cpp/continuous_batching/src/tests/generate_config.cpp +++ b/src/cpp/continuous_batching/src/tests/generate_config.cpp @@ -3,24 +3,24 @@ #include #include -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" TEST(GenerationConfigTest, invalid_temperature) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.temperature = -0.1; config.do_sample = true; EXPECT_THROW(config.validate(), ov::Exception); } TEST(GenerationConfigTest, valid_temperature) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.temperature = 0.1; EXPECT_NO_THROW(config.validate()); 
} TEST(GenerationConfigTest, invalid_top_p) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.top_p = -0.5; EXPECT_THROW(config.validate(), ov::Exception); @@ -29,14 +29,14 @@ TEST(GenerationConfigTest, invalid_top_p) { } TEST(GenerationConfigTest, valid_top_p) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.top_p = 0.1; EXPECT_NO_THROW(config.validate()); } TEST(GenerationConfigTest, invalid_repeatition_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.repetition_penalty = -3.0; EXPECT_THROW(config.validate(), ov::Exception); @@ -45,7 +45,7 @@ TEST(GenerationConfigTest, invalid_repeatition_penalty) { } TEST(GenerationConfigTest, valid_repeatition_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.repetition_penalty = 1.8; EXPECT_NO_THROW(config.validate()); @@ -54,7 +54,7 @@ TEST(GenerationConfigTest, valid_repeatition_penalty) { } TEST(GenerationConfigTest, invalid_presence_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.presence_penalty = 3.0; EXPECT_THROW(config.validate(), ov::Exception); @@ -63,7 +63,7 @@ TEST(GenerationConfigTest, invalid_presence_penalty) { } TEST(GenerationConfigTest, valid_presence_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; config.do_sample = true; config.presence_penalty = 1.8; EXPECT_NO_THROW(config.validate()); @@ -71,20 +71,20 @@ TEST(GenerationConfigTest, valid_presence_penalty) { EXPECT_NO_THROW(config.validate()); } -TEST(GenerationConfigTest, invalid_frequence_penalty) { - GenerationConfig config; +TEST(GenerationConfigTest, invalid_frequency_penalty) { + ov::genai::GenerationConfig config; config.do_sample = true; - config.frequence_penalty = 3.0; + config.frequency_penalty = 3.0; EXPECT_THROW(config.validate(), ov::Exception); - config.frequence_penalty = -3.1; + config.frequency_penalty = -3.1; EXPECT_THROW(config.validate(), ov::Exception); } -TEST(GenerationConfigTest, valid_frequence_penalty) { - GenerationConfig config; +TEST(GenerationConfigTest, valid_frequency_penalty) { + ov::genai::GenerationConfig config; config.do_sample = true; - config.frequence_penalty = 1.8; + config.frequency_penalty = 1.8; EXPECT_NO_THROW(config.validate()); - config.frequence_penalty = -2.0; + config.frequency_penalty = -2.0; EXPECT_NO_THROW(config.validate()); } diff --git a/src/cpp/continuous_batching/src/tests/scheduler.cpp b/src/cpp/continuous_batching/src/tests/scheduler.cpp index 73186f34e0..cf8e3f0dd9 100644 --- a/src/cpp/continuous_batching/src/tests/scheduler.cpp +++ b/src/cpp/continuous_batching/src/tests/scheduler.cpp @@ -7,7 +7,7 @@ #include "continuous_batching_pipeline.hpp" #include "sequence_group.hpp" #include "scheduler.hpp" -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" void clear_finished_sequences(std::vector& requests) { auto new_end = std::remove_if(requests.begin(), requests.end(), [] (SequenceGroup::CPtr seq_group) -> bool { @@ -16,7 +16,6 @@ void clear_finished_sequences(std::vector& requests) { requests.erase(new_end, requests.end()); } - TEST(TestScheduler, general_test) { std::vector configs{ SchedulerConfig { @@ -37,13 +36,13 @@ TEST(TestScheduler, general_test) { for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, 
ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); SequenceGroup::Ptr sequence_group3 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx2 = (*sequence_group3)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; @@ -133,10 +132,10 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; @@ -203,11 +202,11 @@ TEST(TestScheduler, test_partial_preemption) { for (auto scheduler_config: configs) { std::vector tokens1 = {0,1,2,3,4,5,6,7,8,9,10}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens1.size()}, tokens1.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); std::vector tokens2 = {0,1,2,3,4,5,6,7}; auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; @@ -300,10 +299,10 @@ TEST(TestScheduler, test_partially_preempted_prompt) { for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7,8,9,10,11}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 99a461deda..c74349fd4f 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -33,6 +33,7 @@ enum class StopCriteria { EARLY, 
HEURISTIC, NEVER }; * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. * @param ignore_eos if set to true, then generation will not stop even if token is met. * @param eos_token_id token_id of (end of sentence) + * @param min_new_tokens set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching. * * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. @@ -56,6 +57,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * @param presence_penalty reduces absolute log prob if the token was generated at least once. Ignored for non continuous batching. + * @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching. + * @param rng_seed initializes random generator. Ignored for non continuous batching. */ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: @@ -66,6 +70,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t max_new_tokens = SIZE_MAX; size_t max_length = SIZE_MAX; bool ignore_eos = false; + size_t min_new_tokens = 0; // Beam search specific size_t num_beam_groups = 1; @@ -79,13 +84,20 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // Multinomial float temperature = 1.0f; float top_p = 1.0f; - size_t top_k = 50; + size_t top_k = std::numeric_limits::max(); bool do_sample = false; float repetition_penalty = 1.0f; + float presence_penalty = 0.0; + float frequency_penalty = 0.0f; + size_t rng_seed = 0; // EOS special token int64_t eos_token_id = -1; + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. + * Otherwise verifies eos_token_id == tokenizer_eos_token_id. 
+ */ + void set_eos_token_id(size_t tokenizer_eos_token_id); size_t get_max_new_tokens(size_t prompt_length = 0) const; bool is_greedy_decoding() const; bool is_beam_search() const; @@ -110,6 +122,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property max_length{"max_length"}; static constexpr ov::Property ignore_eos{"ignore_eos"}; +static constexpr ov::Property min_new_tokens{"min_new_tokens"}; static constexpr ov::Property num_beam_groups{"num_beam_groups"}; static constexpr ov::Property num_beams{"num_beams"}; @@ -125,6 +138,13 @@ static constexpr ov::Property top_k{"top_k"}; static constexpr ov::Property do_sample{"do_sample"}; static constexpr ov::Property repetition_penalty{"repetition_penalty"}; static constexpr ov::Property eos_token_id{"eos_token_id"}; +static constexpr ov::Property presence_penalty{"presence_penalty"}; +static constexpr ov::Property frequency_penalty{"frequency_penalty"}; +static constexpr ov::Property rng_seed{"rng_seed"}; +// Predefined Configs +OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); +OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); +OPENVINO_GENAI_EXPORTS GenerationConfig multinomial(); } // namespace genai } // namespace ov diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index ce313de1c3..6578a6bd08 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -49,6 +49,16 @@ GenerationConfig::GenerationConfig(const std::string& json_path) { } } +void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { + if (eos_token_id < 0) { + eos_token_id = tokenizer_eos_token_id; + } else { + OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, + "EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (", + tokenizer_eos_token_id, ")"); + } +} + void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { using ov::genai::utils::read_anymap_param; @@ -96,8 +106,9 @@ void GenerationConfig::validate() const { "Beam search with sampling is not supported yet. 
" "Please either set do_sample=false to use beam search " "or set num_beams=1 if you with to use multinomial sampling."); - OPENVINO_ASSERT(num_return_sequences <= num_beams, "num_return_sequences must be less or equal to num_beams"); + OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); + OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); // max_new_tokens has priority over max_length // if max_new_tokens is defined no need to check max_length @@ -123,7 +134,48 @@ void GenerationConfig::validate() const { OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + if (is_beam_search()) { + OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + } else { + OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); + OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); + } } +GenerationConfig beam_search() { + GenerationConfig beam_search_config; + beam_search_config.num_beams = 4; + beam_search_config.num_return_sequences = 3; + beam_search_config.num_beam_groups = 2; + beam_search_config.max_new_tokens = 100; + beam_search_config.diversity_penalty = 2.0f; + return beam_search_config; +} + +GenerationConfig greedy() { + GenerationConfig greedy_config; + greedy_config.temperature = 0.0f; + greedy_config.ignore_eos = true; + greedy_config.num_return_sequences = 1; + greedy_config.repetition_penalty = 3.0f; + greedy_config.presence_penalty = 0.1f; + greedy_config.frequency_penalty = 0.01f; + greedy_config.max_new_tokens = 30; + return greedy_config; +} + +GenerationConfig multinomial() { + GenerationConfig multinomial_config; + multinomial_config.do_sample = true; + multinomial_config.temperature = 0.9f; + multinomial_config.top_p = 0.9f; + multinomial_config.top_k = 20; + multinomial_config.num_return_sequences = 3; + multinomial_config.presence_penalty = 0.01f; + multinomial_config.frequency_penalty = 0.1f; + multinomial_config.min_new_tokens = 15; + multinomial_config.max_new_tokens = 30; + return multinomial_config; +} } // namespace genai } // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d2eb9f4a66..200ce5a635 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -93,7 +93,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) - m_generation_config.eos_token_id = m_tokenizer.get_eos_token_id(); + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } StatefulLLMPipeline( diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index ec123aa167..3a9ea4d1d9 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -203,7 +203,7 @@ EncodedResults StaticLLMPipeline::generate( GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; // If eos_token_id was not provided, take value from default m_generation_config if (config.eos_token_id == -1) - config.eos_token_id = m_generation_config.eos_token_id; + config.set_eos_token_id(m_generation_config.eos_token_id); config.validate(); std::shared_ptr streamer_ptr; diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 7e3f846c01..a1f8072798 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -137,7 +137,7 @@ OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfi } else if (key == "repetition_penalty") { res_config.repetition_penalty = py::cast(item.second); } else if (key == "eos_token_id") { - res_config.eos_token_id = py::cast(item.second); + res_config.set_eos_token_id(py::cast(item.second)); } else { throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); @@ -495,6 +495,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("min_new_tokens", &GenerationConfig::min_new_tokens) .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) .def_readwrite("num_beams", &GenerationConfig::num_beams) .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) @@ -507,7 +508,12 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("top_k", &GenerationConfig::top_k) .def_readwrite("do_sample", &GenerationConfig::do_sample) .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) - .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id); + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("presence_penalty", &GenerationConfig::presence_penalty) + .def_readwrite("frequency_penalty", &GenerationConfig::frequency_penalty) + .def_readwrite("rng_seed", &GenerationConfig::rng_seed) + .def("set_eos_token_id", &GenerationConfig::set_eos_token_id) + .def("is_beam_search", &GenerationConfig::is_beam_search); py::class_(m, "DecodedResults") .def(py::init<>()) diff --git a/src/python/python.cpp b/src/python/python.cpp index 0e5a35e7ac..8034028927 100644 --- a/src/python/python.cpp +++ b/src/python/python.cpp @@ -68,36 +68,6 @@ PYBIND11_MODULE(py_continuous_batching, m) { return res; }); - py::enum_(m, "StopCriteria") - .value("EARLY", StopCriteria::EARLY) - .value("HEURISTIC", StopCriteria::HEURISTIC) - .value("NEVER", StopCriteria::NEVER) - .export_values(); - - py::class_(m, "GenerationConfig") - .def(py::init<>()) - .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) - .def_readwrite("min_new_tokens", &GenerationConfig::min_new_tokens) - .def_readwrite("max_length", &GenerationConfig::max_length) - .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) - .def_readwrite("num_groups", &GenerationConfig::num_groups) - .def_readwrite("group_size", &GenerationConfig::group_size) - .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) - .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) - .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) - .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) - .def_readwrite("presence_penalty", 
&GenerationConfig::presence_penalty) - .def_readwrite("frequence_penalty", &GenerationConfig::frequence_penalty) - .def_readwrite("length_penalty", &GenerationConfig::length_penalty) - .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) - .def_readwrite("temperature", &GenerationConfig::temperature) - .def_readwrite("top_k", &GenerationConfig::top_k) - .def_readwrite("top_p", &GenerationConfig::top_p) - .def_readwrite("do_sample", &GenerationConfig::do_sample) - .def_readwrite("rng_seed", &GenerationConfig::rng_seed) - .def_property_readonly("is_greedy_sampling", &GenerationConfig::is_greedy_sampling) - .def_property_readonly("is_beam_search", &GenerationConfig::is_beam_search); - py::class_(m, "SchedulerConfig") .def(py::init<>()) .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) diff --git a/tests/python_tests/continuous_batching/common.py b/tests/python_tests/continuous_batching/common.py index dfd911f206..2825ccd375 100644 --- a/tests/python_tests/continuous_batching/common.py +++ b/tests/python_tests/continuous_batching/common.py @@ -7,7 +7,8 @@ from optimum.intel import OVModelForCausalLM from pathlib import Path -from openvino_genai.py_continuous_batching import ContinuousBatchingPipeline, GenerationConfig, SchedulerConfig, GenerationResult +from openvino_genai.py_continuous_batching import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult +from openvino_genai import GenerationConfig from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple @@ -37,7 +38,7 @@ def get_greedy_with_penalties() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 generation_config.presence_penalty = 2.0 - generation_config.frequence_penalty = 0.2 + generation_config.frequency_penalty = 0.2 generation_config.max_new_tokens = 30 return generation_config @@ -51,21 +52,21 @@ def get_greedy_with_min_and_max_tokens() -> GenerationConfig: def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_groups = 3 - generation_config.group_size = 2 + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_groups * generation_config.group_size + generation_config.num_return_sequences = generation_config.num_beams return generation_config def get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_groups = 3 - generation_config.group_size = 2 + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 generation_config.min_new_tokens = 15 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_groups * generation_config.group_size + generation_config.num_return_sequences = generation_config.num_beams return generation_config def get_multinomial_temperature() -> GenerationConfig: @@ -136,7 +137,7 @@ def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True generation_config.temperature = 0.8 - generation_config.frequence_penalty = 0.5 + generation_config.frequency_penalty = 0.5 generation_config.num_return_sequences = 1 
generation_config.max_new_tokens = 30 return generation_config @@ -158,7 +159,7 @@ def get_multinomial_max_and_min_token() -> GenerationConfig: multinomial.top_k = 20 multinomial.num_return_sequences = 3 multinomial.presence_penalty = 0.01 - multinomial.frequence_penalty = 0.1 + multinomial.frequency_penalty = 0.1 multinomial.min_new_tokens = 15 multinomial.max_new_tokens = 30 return multinomial @@ -218,10 +219,10 @@ def convert_to_hf( kwargs['pad_token_id'] = default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty - if generation_config.num_groups * generation_config.group_size > 1: + if generation_config.num_beams > 1: # beam search case - kwargs['num_beam_groups'] = generation_config.num_groups - kwargs['num_beams'] = generation_config.num_groups * generation_config.group_size + kwargs['num_beam_groups'] = generation_config.num_beam_groups + kwargs['num_beams'] = generation_config.num_beams kwargs['diversity_penalty'] = generation_config.diversity_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size @@ -257,7 +258,7 @@ def run_hugging_face( generation_result = GenerationResult() generation_result.m_generation_ids = all_text_batch # sequences_scores are available only for beam search case - if generation_config.is_beam_search: + if generation_config.is_beam_search(): generation_result.m_scores = [score for score in generate_outputs.sequences_scores] generation_results.append(generation_result) @@ -293,7 +294,7 @@ def get_models_list(file_name: str): def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): - if generation_config.is_beam_search: + if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): # Note, that for fp32 / fp16 models scores are different less than 0.001 diff --git a/tests/python_tests/continuous_batching/test_sampling.py b/tests/python_tests/continuous_batching/test_sampling.py index 265c8caa6a..0e5667ea1e 100644 --- a/tests/python_tests/continuous_batching/test_sampling.py +++ b/tests/python_tests/continuous_batching/test_sampling.py @@ -3,9 +3,11 @@ import os import pytest import shutil +import sys from dataclasses import dataclass from pathlib import Path -from openvino_genai.py_continuous_batching import GenerationConfig, ContinuousBatchingPipeline +from openvino_genai.py_continuous_batching import ContinuousBatchingPipeline +from openvino_genai import GenerationConfig from typing import List from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ @@ -22,6 +24,7 @@ @pytest.mark.precommit @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +@pytest.mark.xfail(reason='CPU: head size must be multiple of 16, current: 8. 
Ticket 145986.', raises=RuntimeError, strict=True) def test_sampling_precommit(tmp_path, model_id): run_test_pipeline(tmp_path, model_id) @@ -163,6 +166,13 @@ class RandomSamplingTestStruct: "greedy_with_penalties", "multinomial_max_and_min_token"]) def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): + if test_struct in ( + RANDOM_SAMPLING_TEST_CASES[1], + RANDOM_SAMPLING_TEST_CASES[3], + RANDOM_SAMPLING_TEST_CASES[6], + RANDOM_SAMPLING_TEST_CASES[10], + ) and sys.platform.startswith("win"): + pytest.xfail("assert ref_text == ov_text fails") generation_config = test_struct.generation_config prompts = test_struct.prompts
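
Illustrative note (not part of either patch): the StatefulLLMPipeline comment in the first commit explains that the KV cache already holds the previous turns, so only the tokens that are new with respect to the previously templated history should be fed to the model; encoding the new prompt on its own would re-insert the BOS token every turn. The sketch below is a minimal Python analogue of that reasoning, assuming the Hugging Face tokenizer for the TinyLlama chat model already used by the CI job above; the helper name history_delta_ids is hypothetical and only mirrors subtract_chat_tokenized_inputs() at the token level.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')

    def templated(history):
        # Same call the CI reference script uses: render the chat template as a string.
        return tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)

    def history_delta_ids(new_history, prev_templated_text):
        # Token-level analogue of subtract_chat_tokenized_inputs(): encode the whole new
        # templated history and the previously sent templated history, keep only the tail.
        # Like the pipeline, this assumes the old encoding is a prefix of the new one.
        new_ids = tokenizer(templated(new_history)).input_ids
        prev_ids = tokenizer(prev_templated_text).input_ids if prev_templated_text else []
        return new_ids[len(prev_ids):]

    # Turn 1: only the first question has been sent to the model so far.
    history = [{'role': 'user', 'content': 'What is 2 + 2?'}]
    prev_templated_text = templated(history)

    # Turn 2: the model's answer and a new question are appended to the history.
    history += [{'role': 'assistant', 'content': '2 + 2 equals 4.'},
                {'role': 'user', 'content': 'Add 1 to it.'}]

    delta_ids = history_delta_ids(history, prev_templated_text)

    # Naive alternative: tokenize only the newly appended part of the templated string.
    naive_ids = tokenizer(templated(history)[len(prev_templated_text):]).input_ids

    print('delta re-inserts BOS:', delta_ids[:1] == [tokenizer.bos_token_id])  # expected: False
    print('naive re-inserts BOS:', naive_ids[:1] == [tokenizer.bos_token_id])  # typically: True

As a design note on the second commit: the continuous-batching GenerationConfig is folded into ov::genai::GenerationConfig, the former static presets become the free functions ov::genai::greedy(), ov::genai::beam_search() and ov::genai::multinomial() (as used in accuracy_sample.cpp and the scheduler tests), and the per-group beam width is derived as num_beams / num_beam_groups instead of being stored as a separate group_size field.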