From cb100cb3bc7459bb489154937b3a076c5bd9f1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Thu, 11 Jul 2024 16:50:27 +0200 Subject: [PATCH 01/15] [Continuous batching] Replace standard max_element call with custom loop for greedy sampling (#607) Searching for max element in a custom loop gives better performance than using std::max_element --- src/cpp/src/sampler.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index dc631c68ac..6390fc8725 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,8 +219,13 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); - return *out_token; + Token max_token{-std::numeric_limits::infinity() , 0}; + for (const auto& logit : logit_vector) { + if (logit.m_log_prob > max_token.m_log_prob) { + max_token = logit; + } + } + return max_token; } std::vector _multinomial_sample(const std::vector& logit_vector, size_t num_tokens_per_sequence) { From f0e41909ab06e22c569f1af54654aad521ce4a6e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 12 Jul 2024 12:21:38 +0200 Subject: [PATCH 02/15] wip --- samples/CMakeLists.txt | 1 + .../benchmark_vanilla_genai/CMakeLists.txt | 25 +++++++ samples/cpp/benchmark_vanilla_genai/README.md | 2 + .../benchmark_vanilla_genai.cpp | 65 +++++++++++++++++++ .../openvino/genai/generation_metrics.hpp | 40 ++++++++++++ .../include/openvino/genai/llm_pipeline.hpp | 4 ++ src/cpp/src/generation_metrics.cpp | 62 ++++++++++++++++++ src/cpp/src/greedy_decoding.cpp | 17 ++++- src/cpp/src/llm_pipeline.cpp | 10 ++- src/cpp/src/llm_pipeline_base.hpp | 2 + 10 files changed, 223 insertions(+), 5 deletions(-) create mode 100644 samples/cpp/benchmark_vanilla_genai/CMakeLists.txt create mode 100644 
samples/cpp/benchmark_vanilla_genai/README.md create mode 100644 samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp create mode 100644 src/cpp/include/openvino/genai/generation_metrics.hpp create mode 100644 src/cpp/src/generation_metrics.cpp diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 0839d58428..44f8d580b2 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,6 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) +add_subdirectory(cpp/benchmark_vanilla_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) diff --git a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt b/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt new file mode 100644 index 0000000000..e871f5a33a --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+) + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp) +target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_vanilla_genai PROPERTIES + COMPILE_PDB_NAME benchmark_vanilla_genai + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11) +install(TARGETS benchmark_vanilla_genai + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md new file mode 100644 index 0000000000..739c2e950c --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/README.md @@ -0,0 +1,2 @@ +# benchmark OpenVINO GenAI sample + diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp new file mode 100644 index 0000000000..ccb7650b84 --- /dev/null +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) try { + cxxopts::Options options("benchmark_vanilla_genai", "Help command"); + + options.add_options() + ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) + ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(1))) + 
("d,device", "device", cxxopts::value()->default_value("CPU")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + std::string prompt = result["prompt"].as(); + const std::string model_path = result["model"].as(); + std::string device = result["device"].as(); + size_t num_warmup = result["num_warmup"].as(); + size_t num_iter = result["num_iter"].as(); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + + ov::genai::LLMPipeline pipe(model_path, device); + + for (size_t i = 0; i < num_warmup; i++) + pipe.generate(prompt, config); + + ov::genai::GenerationMetrics metrics; + for (size_t i = 0; i < num_iter; i++) { + ov::genai::DecodedResults res = pipe.generate(prompt, config); + metrics = metrics + res.metrics; + metrics.load_time = res.metrics.load_time; + } + + std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; + std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; + std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; + std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl; + + return 0; +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) 
{ + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/src/cpp/include/openvino/genai/generation_metrics.hpp b/src/cpp/include/openvino/genai/generation_metrics.hpp new file mode 100644 index 0000000000..7129e5c52b --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_metrics.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +namespace ov { +namespace genai { + +using TimePoints = std::vector; + +struct GenerationMetrics { + GenerationMetrics() = default; + + GenerationMetrics(const TimePoints& tok_times, size_t batch_size = 1); + GenerationMetrics(const std::vector& durations, const std::vector& times_to_first_token, size_t batch_size = 1); + + // First token time. + float mean_ttft; + float std_ttft; + std::vector times_to_first_token; + + // Time per output token. + float mean_tpot; + float std_tpot; + std::vector durations; + + std::pair get_tokens_per_sec() const; + size_t batch_size; + float load_time; + + GenerationMetrics operator+(GenerationMetrics const& metrics) const; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 84dc02bd58..9f0c9fba97 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -5,11 +5,13 @@ #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/generation_metrics.hpp" namespace ov { namespace genai { @@ -34,6 +36,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; + GenerationMetrics metrics; }; /** @@ -47,6 +50,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; + GenerationMetrics metrics; // 
@brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/src/generation_metrics.cpp b/src/cpp/src/generation_metrics.cpp new file mode 100644 index 0000000000..8ca8e0a07d --- /dev/null +++ b/src/cpp/src/generation_metrics.cpp @@ -0,0 +1,62 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/generation_metrics.hpp" +#include + +namespace { + +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const float& duration) -> float { + return acc + duration * duration; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + +} // namespace + +namespace ov { +namespace genai { + + +GenerationMetrics::GenerationMetrics(const TimePoints& tok_times, size_t batch_size) { + this->batch_size = batch_size; + durations = std::vector(tok_times.size() - 1); + for (size_t i = 1; i < tok_times.size(); ++i) { + durations[i - 1] = std::chrono::duration_cast(tok_times[i] - tok_times[i - 1]).count(); + } + times_to_first_token.emplace_back(durations[0]); + + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); +} + +GenerationMetrics::GenerationMetrics(const std::vector& durations_, const std::vector& times_to_first_token_, size_t batch_size) + : durations(durations_), times_to_first_token(times_to_first_token_) { + this->batch_size = batch_size; + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); +} + +GenerationMetrics GenerationMetrics::operator+(GenerationMetrics const& metrics) const { + std::vector new_durations = durations; + std::vector 
new_times_to_first_token = times_to_first_token; + new_durations.insert(new_durations.end(), metrics.durations.begin(), metrics.durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), metrics.times_to_first_token.begin(), metrics.times_to_first_token.end()); + + return GenerationMetrics(new_durations, new_times_to_first_token); +} + +std::pair GenerationMetrics::get_tokens_per_sec() const { + auto mean_tps = 1000.0f * batch_size / mean_tpot; + auto std_tps = 1000.0f * std_tpot / (mean_tpot * mean_tpot); + return {mean_tps, std_tps}; +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 9170c7d2f9..dad93a0e6e 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -19,12 +19,18 @@ EncodedResults greedy_decoding( const size_t batch_size = prompts_shape[0]; size_t running_batch_size = batch_size; size_t prompt_len = prompts_shape[1]; + size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; + // Time before the first token generated as a reference point. 
+ ov::genai::TimePoints tok_times; + tok_times.reserve(max_new_tokens); + tok_times.emplace_back(std::chrono::steady_clock::now()); + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); - + m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); if (position_ids.has_value()) @@ -50,6 +56,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + tok_times.emplace_back(std::chrono::steady_clock::now()); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -58,8 +66,8 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - for (size_t i = 0; i < max_tokens - 1; ++i) { + + for (size_t i = 0; i < max_new_tokens - 1; ++i) { if (position_ids.has_value()) utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); @@ -80,6 +88,7 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + tok_times.emplace_back(std::chrono::steady_clock::now()); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -106,6 +115,8 @@ EncodedResults greedy_decoding( if (streamer) { streamer->end(); } + + results.metrics = GenerationMetrics(tok_times); return results; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 507d988a6a..918e744286 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -9,6 +9,7 @@ #include #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" 
+#include "openvino/genai/generation_metrics.hpp" #include "llm_pipeline_base.hpp" #include "llm_pipeline_static.hpp" #include "utils.hpp" @@ -155,6 +156,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + decoded_results.metrics = std::move(encoded_results.metrics); + decoded_results.metrics.load_time = m_load_time_ms; return decoded_results; } @@ -253,7 +256,6 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } - return result; } @@ -350,6 +352,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ) { + if (device == "NPU") { m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); } else { @@ -361,12 +364,15 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -) { +) { + auto start_time = std::chrono::steady_clock::now(); if (device == "NPU") { m_pimpl = make_unique(std::filesystem::path(path), device, config); } else { m_pimpl = make_unique(std::filesystem::path(path), device, config); } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp index 9df6442b35..7e58cd3b37 100644 --- a/src/cpp/src/llm_pipeline_base.hpp +++ b/src/cpp/src/llm_pipeline_base.hpp @@ -36,6 +36,8 @@ class LLMPipelineImplBase { Tokenizer m_tokenizer; GenerationConfig m_generation_config; + + float m_load_time_ms = 0; }; } // namespace genai From 7cab496c63a598dcb96027c9a88d3c96ef1b5b48 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 19 Jul 2024 13:01:02 +0200 Subject: [PATCH 03/15] add detokenization metric; refactor split to perf_conter & perf_metrics 
--- .../benchmark_vanilla_genai.cpp | 8 +- .../openvino/genai/generation_metrics.hpp | 40 --------- .../include/openvino/genai/llm_pipeline.hpp | 6 +- .../include/openvino/genai/perf_metrics.hpp | 50 ++++++++++++ src/cpp/src/generation_metrics.cpp | 62 -------------- src/cpp/src/greedy_decoding.cpp | 19 ++--- src/cpp/src/group_beam_searcher.cpp | 19 +++-- src/cpp/src/llm_pipeline.cpp | 30 +++++-- src/cpp/src/perf_counters.cpp | 21 +++++ src/cpp/src/perf_counters.hpp | 44 ++++++++++ src/cpp/src/perf_metrics.cpp | 81 +++++++++++++++++++ src/cpp/src/tokenizer.cpp | 2 + src/cpp/src/utils.hpp | 14 ++++ src/python/py_generate_pipeline.cpp | 14 ++++ tests/python_tests/ov_genai_test_utils.py | 2 + 15 files changed, 282 insertions(+), 130 deletions(-) delete mode 100644 src/cpp/include/openvino/genai/generation_metrics.hpp create mode 100644 src/cpp/include/openvino/genai/perf_metrics.hpp delete mode 100644 src/cpp/src/generation_metrics.cpp create mode 100644 src/cpp/src/perf_counters.cpp create mode 100644 src/cpp/src/perf_counters.hpp create mode 100644 src/cpp/src/perf_metrics.cpp diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index ccb7650b84..6489282b0b 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -37,23 +37,25 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = 100; + config.num_beam_groups = 3; + config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); - ov::genai::GenerationMetrics metrics; + ov::genai::PerfMetrics metrics; for (size_t i = 0; i < num_iter; i++) { ov::genai::DecodedResults res = pipe.generate(prompt, config); metrics = metrics + res.metrics; metrics.load_time = res.metrics.load_time; } - + std::cout << "Load time: " << 
metrics.load_time << " ms" << std::endl; std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; - std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl; + std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl; return 0; } catch (const std::exception& error) { diff --git a/src/cpp/include/openvino/genai/generation_metrics.hpp b/src/cpp/include/openvino/genai/generation_metrics.hpp deleted file mode 100644 index 7129e5c52b..0000000000 --- a/src/cpp/include/openvino/genai/generation_metrics.hpp +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include - -namespace ov { -namespace genai { - -using TimePoints = std::vector; - -struct GenerationMetrics { - GenerationMetrics() = default; - - GenerationMetrics(const TimePoints& tok_times, size_t batch_size = 1); - GenerationMetrics(const std::vector& durations, const std::vector& times_to_first_token, size_t batch_size = 1); - - // First token time. - float mean_ttft; - float std_ttft; - std::vector times_to_first_token; - - // Time per output token. 
- float mean_tpot; - float std_tpot; - std::vector durations; - - std::pair get_tokens_per_sec() const; - size_t batch_size; - float load_time; - - GenerationMetrics operator+(GenerationMetrics const& metrics) const; -}; - -} // namespace genai -} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 9f0c9fba97..4db3c613e7 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -11,7 +11,7 @@ #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" -#include "openvino/genai/generation_metrics.hpp" +#include "openvino/genai/perf_metrics.hpp" namespace ov { namespace genai { @@ -36,7 +36,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; - GenerationMetrics metrics; + PerfMetrics metrics; }; /** @@ -50,7 +50,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; - GenerationMetrics metrics; + PerfMetrics metrics; // @brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp new file mode 100644 index 0000000000..a11c4e0374 --- /dev/null +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -0,0 +1,50 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "openvino/genai/visibility.hpp" +#include +#include + +namespace ov { +namespace genai { + +using TimePoint = std::chrono::steady_clock::time_point; + +struct PerfCounters; + +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + // First token time. + float mean_ttft; + float std_ttft; + + // Time per output token. 
+ float mean_tpot; + float std_tpot; + + float load_time; + float start_time; + + float mean_generate_duration; + float mean_decoding_duration; + float mean_encoding_duration; + + float mean_throughput; + float std_throughput; + + size_t num_generated_tokens; + size_t num_input_tokens; + + std::shared_ptr m_counters; + void evaluate(TimePoint start_time); + + PerfMetrics operator+(const PerfMetrics& metrics) const; + PerfMetrics& operator+=(const PerfMetrics& right); + + +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_metrics.cpp b/src/cpp/src/generation_metrics.cpp deleted file mode 100644 index 8ca8e0a07d..0000000000 --- a/src/cpp/src/generation_metrics.cpp +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/generation_metrics.hpp" -#include - -namespace { - -std::pair calc_mean_and_std(const std::vector& durations) { - float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); - - float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, - [](const float& acc, const float& duration) -> float { - return acc + duration * duration; - }); - float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); - return {mean, std}; -} - -} // namespace - -namespace ov { -namespace genai { - - -GenerationMetrics::GenerationMetrics(const TimePoints& tok_times, size_t batch_size) { - this->batch_size = batch_size; - durations = std::vector(tok_times.size() - 1); - for (size_t i = 1; i < tok_times.size(); ++i) { - durations[i - 1] = std::chrono::duration_cast(tok_times[i] - tok_times[i - 1]).count(); - } - times_to_first_token.emplace_back(durations[0]); - - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); -} - -GenerationMetrics::GenerationMetrics(const std::vector& durations_, const 
std::vector& times_to_first_token_, size_t batch_size) - : durations(durations_), times_to_first_token(times_to_first_token_) { - this->batch_size = batch_size; - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(times_to_first_token); -} - -GenerationMetrics GenerationMetrics::operator+(GenerationMetrics const& metrics) const { - std::vector new_durations = durations; - std::vector new_times_to_first_token = times_to_first_token; - new_durations.insert(new_durations.end(), metrics.durations.begin(), metrics.durations.end()); - new_times_to_first_token.insert(new_times_to_first_token.end(), metrics.times_to_first_token.begin(), metrics.times_to_first_token.end()); - - return GenerationMetrics(new_durations, new_times_to_first_token); -} - -std::pair GenerationMetrics::get_tokens_per_sec() const { - auto mean_tps = 1000.0f * batch_size / mean_tpot; - auto std_tps = 1000.0f * std_tpot / (mean_tpot * mean_tpot); - return {mean_tps, std_tps}; -} - - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index dad93a0e6e..0802b87e66 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,7 +1,8 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "perf_counters.hpp" #include "utils.hpp" namespace ov { @@ -22,11 +23,8 @@ EncodedResults greedy_decoding( size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; - // Time before the first token generated as a reference point. 
- ov::genai::TimePoints tok_times; - tok_times.reserve(max_new_tokens); - tok_times.emplace_back(std::chrono::steady_clock::now()); - + auto& perf_counters = results.metrics.m_counters; + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); @@ -56,8 +54,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - tok_times.emplace_back(std::chrono::steady_clock::now()); - + perf_counters->add_timestamp(running_batch_size); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -88,7 +86,7 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - tok_times.emplace_back(std::chrono::steady_clock::now()); + perf_counters->add_timestamp(running_batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -116,9 +114,8 @@ EncodedResults greedy_decoding( streamer->end(); } - results.metrics = GenerationMetrics(tok_times); return results; } } //namespace genai -} //namespace ov \ No newline at end of file +} //namespace ov diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 8695aeac02..4f5cb79f2a 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -362,14 +362,20 @@ std::pair beam_search(ov::InferRequest& lm, std::optional selected_beam_idx) { OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); - - // Initialize beam search + auto batch_size = input_ids.get_shape().at(0); + auto sequence_length = input_ids.get_shape().at(1); + + // Initialize time metric counters. 
+ // ov::genai::TimePoints tok_times; + // tok_times.reserve(config.get_max_new_tokens(sequence_length)); + // tok_times.emplace_back(std::chrono::steady_clock::now()); + + // Initialize beam search. const int64_t* prompt_data = input_ids.data(); std::vector> prompts; prompts.reserve(batch_size); for (size_t batch = 0; batch < batch_size; batch++) { - size_t sequence_length = input_ids.get_shape().at(1); size_t batch_offset = batch * sequence_length; const int64_t* prompt_start = prompt_data + batch_offset; prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); @@ -389,7 +395,7 @@ std::pair beam_search(ov::InferRequest& lm, lm.set_tensor("beam_idx", beam_idx); Parameters parameters{std::move(prompts)}; - parameters.max_new_tokens = config.max_new_tokens; + parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; @@ -406,6 +412,8 @@ std::pair beam_search(ov::InferRequest& lm, lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + // tok_times.emplace_back(std::chrono::steady_clock::now()); + if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { // Break the cycle before masks are extended in update_attention_mask_with_beams. // If generation is continued, attention_mask length should be equal to KV cache size. 
@@ -462,7 +470,8 @@ std::pair beam_search(ov::InferRequest& lm, results.tokens.push_back(std::move(beam->get().tokens)); } } - + + // results.metrics = PerfCounters(tok_times); return {results, res_selected_beam_idx}; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 918e744286..81f807c149 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -1,6 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "perf_counters.hpp" #include #include #include @@ -9,7 +10,7 @@ #include #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" -#include "openvino/genai/generation_metrics.hpp" +#include "openvino/genai/perf_metrics.hpp" #include "llm_pipeline_base.hpp" #include "llm_pipeline_static.hpp" #include "utils.hpp" @@ -111,8 +112,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; - EncodedInputs encoded_input; + TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { encoded_input = m_tokenizer.encode(*input_vector); @@ -144,9 +146,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { encoded_input = m_tokenizer.encode(prompt); } } + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(encoded_input, config, streamer); - auto encoded_results = generate(encoded_input, config, streamer); + auto decode_start_time = std::chrono::steady_clock::now(); DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); if (is_chat_conversation) { // Tail of chat template is missing in KV cache. 
@@ -155,9 +160,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_templated_chat_history.append(answer); m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + + auto& metrics = encoded_results.metrics; + // metrics.tokenization_duration = std::chrono::duration_cast(encode_stop_time - start_time).count(); + // metrics.detokenization_duration = std::chrono::duration_cast(decode_stop_time - decode_start_time).count(); - decoded_results.metrics = std::move(encoded_results.metrics); - decoded_results.metrics.load_time = m_load_time_ms; + // auto stop_time = std::chrono::steady_clock::now(); + // metrics.generate_durations.emplace_back(std::chrono::duration_cast(stop_time - start_time).count()); + decoded_results.metrics = std::move(metrics); return decoded_results; } @@ -166,9 +176,9 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { OptionalGenerationConfig generation_config, StreamerVariant streamer ) override { + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; - if (auto data = std::get_if(&inputs)) { input_ids = *data; attention_mask = ov::genai::utils::init_attention_mask(input_ids); @@ -256,6 +266,14 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } + + + + auto& metrics = result.metrics; + // metrics.batch_size = batch_size; + // metrics.num_generated_tokens = (metrics.m_durations.size() + 1) * batch_size; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(0); + result.metrics = std::move(metrics); return result; } diff --git a/src/cpp/src/perf_counters.cpp b/src/cpp/src/perf_counters.cpp new file mode 100644 index 0000000000..c9dac6eca0 --- /dev/null +++ b/src/cpp/src/perf_counters.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "perf_counters.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include 
"openvino/openvino.hpp" +#include +#include +#include + +namespace ov { +namespace genai { + +void PerfCounters::add_timestamp(size_t batch_size) { + m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + m_batch_sizes.emplace_back(batch_size); +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/perf_counters.hpp b/src/cpp/src/perf_counters.hpp new file mode 100644 index 0000000000..7d33490205 --- /dev/null +++ b/src/cpp/src/perf_counters.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +namespace ov { +namespace genai { + +struct PerfCounters { + std::vector generate_durations; + std::vector tokenization_duration; + std::vector detokenization_duration; + size_t num_generated_tokens; + size_t num_input_tokens; + + std::vector m_batch_sizes; + std::vector m_durations; + std::vector m_times_to_first_token; + std::vector m_new_token_times; + void add_timestamp(size_t batch_size); + // void add_gen_finish_timestamp(size_t batch_size); + +}; + +// class StopWatch { +// TimePoint m_start; +// public: +// StopWatch& start() { +// m_start = std::chrono::steady_clock::now(); +// return *this; +// } + +// float split() { +// std::chrono::steady_clock::time_point curr_time = std::chrono::steady_clock::now(); +// return std::chrono::duration_cast(curr_time - m_start).count(); +// } +// }; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp new file mode 100644 index 0000000000..4a8b1d76c6 --- /dev/null +++ b/src/cpp/src/perf_metrics.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "perf_counters.hpp" +#include "openvino/openvino.hpp" +#include +#include +#include + +namespace { + +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = 
std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const float& duration) -> float { + return acc + duration * duration; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + + +} // namespace + +namespace ov { +namespace genai { + +void PerfMetrics::evaluate(TimePoint start_time) { + + auto& tok_times = m_counters->m_new_token_times; + auto& batch_sizes = m_counters->m_batch_sizes; + m_counters->m_durations = std::vector(tok_times.size()); + + auto ttft = std::chrono::duration_cast(tok_times[0] - start_time).count(); + m_counters->m_times_to_first_token.emplace_back(ttft); + + for (size_t i = 0; i < tok_times.size(); ++i) { + m_counters->m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time).count(); + // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. + // todo: float check that it's valid for batch > 1. 
+ m_counters->m_durations[i] /= batch_sizes[i]; + start_time = tok_times[i]; + } + + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(m_counters->m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(m_counters->m_times_to_first_token); +} + +PerfMetrics PerfMetrics::operator+(const PerfMetrics& metrics) const { + PerfMetrics nm; // new metrics + nm.m_counters = m_counters; + auto& new_counters = nm.m_counters; + + auto& new_durations = new_counters->m_durations; + auto& new_times_to_first_token = new_counters->m_times_to_first_token; + + auto& counters_to_appnd = metrics.m_counters; + new_durations.insert(new_durations.end(), counters_to_appnd->m_durations.begin(), counters_to_appnd->m_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), counters_to_appnd->m_times_to_first_token.begin(), counters_to_appnd->m_times_to_first_token.end()); + + OPENVINO_ASSERT(metrics.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); + + std::tie(nm.mean_tpot, nm.std_tpot) = calc_mean_and_std(new_counters->m_durations); + std::tie(nm.mean_ttft, nm.std_ttft) = calc_mean_and_std(new_counters->m_times_to_first_token); + + // todo: add tokenization statistics concatenation. 
+ + return nm; +} + +PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { + *this = *this + right; + return *this; +} + + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index ac6b925dcb..501d0e86cf 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -323,6 +323,8 @@ class Tokenizer::TokenizerImpl { // Replace what jinja2cpp doesn't support std::pair replace_str_map[] = { + {"{-", "{"}, + {"{%-", "{%"}, {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""} diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 25acc1c87f..446ef8549b 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -12,6 +12,20 @@ namespace ov { namespace genai { namespace utils { +#include +#include +#include + +// Templated function to measure execution time of an object method. +template +std::pair execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... args) { + auto start = std::chrono::steady_clock::now(); + Ret result = (instance.*method)(std::forward(args)...); + auto end = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast(end - start).count(); + return {result, duration}; +} + Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index d7b2aab29c..c78c760b6c 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -21,6 +21,7 @@ using ov::genai::GenerationConfig; using ov::genai::GenerationResult; using ov::genai::LLMPipeline; using ov::genai::OptionalGenerationConfig; +using ov::genai::PerfMetrics; using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; @@ -536,6 +537,19 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("scores", &DecodedResults::scores) .def("__str__", &DecodedResults::operator std::string);; + 
py::class_(m, "PerfMetrics") + .def(py::init<>()) + .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) + .def_readonly("mean_decoding_duration", &PerfMetrics::mean_decoding_duration) + .def_readonly("mean_encoding_duration", &PerfMetrics::mean_encoding_duration) + .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) + .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) + .def_readonly("std_tpot", &PerfMetrics::std_tpot) + .def_readonly("std_ttft", &PerfMetrics::std_ttft) + .def_readonly("load_time", &PerfMetrics::load_time) + .def("__add__", &PerfMetrics::operator+) + .def("__iadd__", &PerfMetrics::operator+=); + py::class_(m, "TokenizedInputs") .def(py::init()) .def_readwrite("input_ids", &TokenizedInputs::input_ids) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 4ba71a1d48..5d038e65e2 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -81,6 +81,8 @@ def get_chat_templates(): # but skips some models that currently are not processed correctly. skipped_models = { + "berkeley-nest/Starling-LM-7B-alpha", # TODO: Need to enable and unskip, since it's preset in continious batching and has ~30 000 downloads. + # These models fail even on HF so no need to check if applying chat matches. 
"vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", From bb1113ce69dc0126a1b83a66394f63d09146044a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 22 Jul 2024 13:10:03 +0200 Subject: [PATCH 04/15] refactor structure, add python sample --- samples/cpp/benchmark_vanilla_genai/README.md | 1 + .../benchmark_vanilla_genai.cpp | 22 ++-- .../python/benchmark_vanilla_genai/README.md | 66 ++++++++++++ .../benchmark_vanilla_genai.py | 50 +++++++++ .../include/openvino/genai/llm_pipeline.hpp | 2 + .../include/openvino/genai/perf_metrics.hpp | 37 +++++-- src/cpp/src/greedy_decoding.cpp | 10 +- src/cpp/src/group_beam_searcher.cpp | 20 ++-- src/cpp/src/llm_pipeline.cpp | 31 +++--- src/cpp/src/perf_counters.cpp | 21 ---- src/cpp/src/perf_counters.hpp | 44 -------- src/cpp/src/perf_metrics.cpp | 100 +++++++++++------- src/cpp/src/tokenizer.cpp | 2 - src/python/py_generate_pipeline.cpp | 25 ++++- tests/python_tests/ov_genai_test_utils.py | 2 - 15 files changed, 279 insertions(+), 154 deletions(-) create mode 100644 samples/python/benchmark_vanilla_genai/README.md create mode 100755 samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py delete mode 100644 src/cpp/src/perf_counters.cpp delete mode 100644 src/cpp/src/perf_counters.hpp diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md index 739c2e950c..50197dad1d 100644 --- a/samples/cpp/benchmark_vanilla_genai/README.md +++ b/samples/cpp/benchmark_vanilla_genai/README.md @@ -1,2 +1,3 @@ # benchmark OpenVINO GenAI sample +TODO: adapt from python sample to c++ \ No newline at end of file diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index 6489282b0b..6d96d24fc5 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -11,7 +11,8 @@ int 
main(int argc, char* argv[]) try { ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(5))) + ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); @@ -36,26 +37,27 @@ int main(int argc, char* argv[]) try { size_t num_iter = result["num_iter"].as(); ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - config.num_beam_groups = 3; - config.num_beams = 15; + config.max_new_tokens = result["max_new_tokens"].as(); ov::genai::LLMPipeline pipe(model_path, device); for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); - ov::genai::PerfMetrics metrics; - for (size_t i = 0; i < num_iter; i++) { - ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::PerfMetrics metrics = res.metrics; + for (size_t i = 0; i < num_iter - 1; i++) { + res = pipe.generate(prompt, config); metrics = metrics + res.metrics; - metrics.load_time = res.metrics.load_time; } std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; + std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << 
std::endl; std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; - std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl; - std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl; + std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl; + std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_vanilla_genai/README.md new file mode 100644 index 0000000000..af66ea545d --- /dev/null +++ b/samples/python/benchmark_vanilla_genai/README.md @@ -0,0 +1,66 @@ +# Benchmark Vanilla GenAI + +This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +# ov.genai.PerfMetrics structure +ov.genai.PerfMetrics is a structure which holds performance metrics for each generate call. Each generate call calculates the following metrics: +- mean_ttft + - std_ttft + - mean_tpot + - std_tpot + - load_time + - mean_generate_duration + - std_generate_duration + - mean_tokenization_duration + - std_tokenization_duration + - mean_detokenization_duration + - std_detokenization_duration + - mean_throughput + - std_throughput + - num_generated_tokens + - num_input_tokens + +Performance metrics can be added to one another and accumulated using the += operator or the + operator. In that case the mean and standard deviation values are recalculated over all of the accumulated generate calls. + + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+ +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +python benchmark_vanilla_genai.py [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ +``` + +``` +Load time: 3446 ms +Generate time: 876.2 ± 3.30719 ms +Tokenization time: 0 ± 0 ms +Detokenization time: 0 ± 0 ms +ttft: 168 ± 0 ms +tpot: 174.68 ± 4.08671 ms +Tokens/s: 5.72475 ± 0.133933 +``` diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py new file mode 100755 index 0000000000..4c87234179 --- /dev/null +++ b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py @@ -0,0 +1,50 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai +import pdb + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n",
"--num_iter", type=int, default=3, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + prompt = [args.prompt] + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + + config = ov_genai.GenerationConfig() + config.max_new_tokens = args.num_new_tokens + + pipe = ov_genai.LLMPipeline(model_path, device) + + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + metrics = res.metrics + for _ in range(num_iter - 1): + # pdb.set_trace() + res = pipe.generate(prompt, config) + metrics += res.metrics + + print(f"Load time: {metrics.load_time} ms") + print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") + print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") + print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms") + print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms") + print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms") + print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}") + +if __name__ == "__main__": + main() diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 4db3c613e7..14100d4f16 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -31,6 +31,7 @@ using StringInputs = std::variant>; * * @param tokens sequence of resulting tokens * @param scores sum of logarithmic probabilities of all tokens in the sequence +* @param metrics performance metrics with tpot, ttft, etc. 
of type ov::genai::PerfMetrics */ class EncodedResults { public: @@ -45,6 +46,7 @@ class EncodedResults { * * @param texts vector of resulting sequences * @param scores scores for each sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class DecodedResults { public: diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index a11c4e0374..e66c917e81 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -7,14 +7,34 @@ #include "openvino/genai/visibility.hpp" #include #include +#include namespace ov { namespace genai { using TimePoint = std::chrono::steady_clock::time_point; -struct PerfCounters; +/** +* @brief Structure with raw performance metrics for each generation before any statistics calculated. +*/ +struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; + + std::vector m_times_to_first_token; + std::vector m_new_token_times; + std::vector m_batch_sizes; + std::vector m_durations; + size_t num_generated_tokens; + size_t num_input_tokens; +}; + +/** +* @brief Structure to store performance metric for each generation +* +*/ struct OPENVINO_GENAI_EXPORTS PerfMetrics { // First token time. 
float mean_ttft; @@ -25,11 +45,13 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { float std_tpot; float load_time; - float start_time; float mean_generate_duration; - float mean_decoding_duration; - float mean_encoding_duration; + float std_generate_duration; + float mean_tokenization_duration; + float std_tokenization_duration; + float mean_detokenization_duration; + float std_detokenization_duration; float mean_throughput; float std_throughput; @@ -37,13 +59,12 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_generated_tokens; size_t num_input_tokens; - std::shared_ptr m_counters; - void evaluate(TimePoint start_time); - + void evaluate_statistics(std::optional start_time = std::nullopt); + static float get_duration_ms(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); - + RawPerfMetrics raw_counters; }; } // namespace genai diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 0802b87e66..c5bf10a2d1 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -#include "perf_counters.hpp" +// #include "perf_counters.hpp" #include "utils.hpp" namespace ov { @@ -23,7 +23,7 @@ EncodedResults greedy_decoding( size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); EncodedResults results; - auto& perf_counters = results.metrics.m_counters; + auto& raw_perf_counters = results.metrics.raw_counters; results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); @@ -54,7 +54,8 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - perf_counters->add_timestamp(running_batch_size); + 
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) { return results; @@ -86,7 +87,8 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } - perf_counters->add_timestamp(running_batch_size); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 4f5cb79f2a..784ff1a915 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -366,11 +366,6 @@ std::pair beam_search(ov::InferRequest& lm, auto batch_size = input_ids.get_shape().at(0); auto sequence_length = input_ids.get_shape().at(1); - // Initialize time metric counters. - // ov::genai::TimePoints tok_times; - // tok_times.reserve(config.get_max_new_tokens(sequence_length)); - // tok_times.emplace_back(std::chrono::steady_clock::now()); - // Initialize beam search. const int64_t* prompt_data = input_ids.data(); std::vector> prompts; @@ -407,12 +402,19 @@ std::pair beam_search(ov::InferRequest& lm, std::vector next_tokens; std::vector next_beams; - + + // Reserve for performance counters. 
+ std::vector new_token_times; + std::vector batch_sizes; + new_token_times.reserve(parameters.max_new_tokens); + batch_sizes.reserve(parameters.max_new_tokens); + for (size_t length_count = 0; ; ++length_count) { lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - // tok_times.emplace_back(std::chrono::steady_clock::now()); + new_token_times.emplace_back(std::chrono::steady_clock::now()); + batch_sizes.emplace_back(batch_size); if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { // Break the cycle before masks are extended in update_attention_mask_with_beams. @@ -442,6 +444,9 @@ std::pair beam_search(ov::InferRequest& lm, int32_t res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); + auto& raw_perf_counters = results.metrics.raw_counters; + raw_perf_counters.m_new_token_times = new_token_times; + raw_perf_counters.m_batch_sizes = batch_sizes; // align output with HF for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { @@ -471,7 +476,6 @@ std::pair beam_search(ov::InferRequest& lm, } } - // results.metrics = PerfCounters(tok_times); return {results, res_selected_beam_idx}; } diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 81f807c149..5241142afe 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -1,7 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "perf_counters.hpp" #include #include #include @@ -160,14 +159,18 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_templated_chat_history.append(answer); m_history.push_back({{"role", "assistant"}, {"content", answer}}); } - - auto& metrics = encoded_results.metrics; - // metrics.tokenization_duration = std::chrono::duration_cast(encode_stop_time - start_time).count(); - // 
metrics.detokenization_duration = std::chrono::duration_cast(decode_stop_time - decode_start_time).count(); - // auto stop_time = std::chrono::steady_clock::now(); - // metrics.generate_durations.emplace_back(std::chrono::duration_cast(stop_time - start_time).count()); - decoded_results.metrics = std::move(metrics); + // generate_durations + decoded_results.metrics = encoded_results.metrics; + + auto& raw_counters = decoded_results.metrics.raw_counters; + auto stop_time = std::chrono::steady_clock::now(); + + raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); + + decoded_results.metrics.evaluate_statistics(start_time); return decoded_results; } @@ -267,13 +270,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { m_is_cache_empty = false; } - - + // If is called without tokenization then that stat will not be reported. 
auto& metrics = result.metrics; - // metrics.batch_size = batch_size; - // metrics.num_generated_tokens = (metrics.m_durations.size() + 1) * batch_size; - metrics.num_input_tokens = batch_size * input_ids.get_shape().at(0); - result.metrics = std::move(metrics); + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.evaluate_statistics(start_time); return result; } @@ -390,7 +391,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); + m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/perf_counters.cpp b/src/cpp/src/perf_counters.cpp deleted file mode 100644 index c9dac6eca0..0000000000 --- a/src/cpp/src/perf_counters.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "perf_counters.hpp" -#include "openvino/genai/perf_metrics.hpp" -#include "openvino/openvino.hpp" -#include -#include -#include - -namespace ov { -namespace genai { - -void PerfCounters::add_timestamp(size_t batch_size) { - m_new_token_times.emplace_back(std::chrono::steady_clock::now()); - m_batch_sizes.emplace_back(batch_size); -} - - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/perf_counters.hpp b/src/cpp/src/perf_counters.hpp deleted file mode 100644 index 7d33490205..0000000000 --- a/src/cpp/src/perf_counters.hpp +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -namespace ov { -namespace genai { - -struct PerfCounters { - std::vector generate_durations; - std::vector 
tokenization_duration; - std::vector detokenization_duration; - size_t num_generated_tokens; - size_t num_input_tokens; - - std::vector m_batch_sizes; - std::vector m_durations; - std::vector m_times_to_first_token; - std::vector m_new_token_times; - void add_timestamp(size_t batch_size); - // void add_gen_finish_timestamp(size_t batch_size); - -}; - -// class StopWatch { -// TimePoint m_start; -// public: -// StopWatch& start() { -// m_start = std::chrono::steady_clock::now(); -// return *this; -// } - -// float split() { -// std::chrono::steady_clock::time_point curr_time = std::chrono::steady_clock::now(); -// return std::chrono::duration_cast(curr_time - m_start).count(); -// } -// }; - -} // namespace genai -} // namespace ov diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 4a8b1d76c6..3947793802 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -#include "perf_counters.hpp" #include "openvino/openvino.hpp" #include #include @@ -17,7 +16,7 @@ std::pair calc_mean_and_std(const std::vector& durations) { [](const float& acc, const float& duration) -> float { return acc + duration * duration; }); - float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; } @@ -26,48 +25,77 @@ std::pair calc_mean_and_std(const std::vector& durations) { namespace ov { namespace genai { - -void PerfMetrics::evaluate(TimePoint start_time) { - - auto& tok_times = m_counters->m_new_token_times; - auto& batch_sizes = m_counters->m_batch_sizes; - m_counters->m_durations = std::vector(tok_times.size()); - auto ttft = std::chrono::duration_cast(tok_times[0] - start_time).count(); - m_counters->m_times_to_first_token.emplace_back(ttft); +float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) { + 
return std::chrono::duration_cast(duration).count(); +} - for (size_t i = 0; i < tok_times.size(); ++i) { - m_counters->m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time).count(); - // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. - // todo: float check that it's valid for batch > 1. - m_counters->m_durations[i] /= batch_sizes[i]; - start_time = tok_times[i]; - } +void PerfMetrics::evaluate_statistics(std::optional start_time) { + // If start_time is specified then recalculate durations according to start times and calculate statistics only after that. + if (start_time.has_value()) { + auto start_time_val = *start_time; + auto& tok_times = raw_counters.m_new_token_times; + auto& batch_sizes = raw_counters.m_batch_sizes; + raw_counters.m_durations = std::vector(tok_times.size()); - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(m_counters->m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(m_counters->m_times_to_first_token); -} + auto ttft = std::chrono::duration_cast(tok_times[0] - start_time_val).count(); + raw_counters.m_times_to_first_token = std::vector(); + raw_counters.m_times_to_first_token.emplace_back(ttft); + num_generated_tokens = 0; + for (size_t i = 0; i < tok_times.size(); ++i) { + raw_counters.m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time_val).count(); + + // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 ms / 5. + // todo: float check that it's valid for batch > 1.
+ raw_counters.m_durations[i] /= batch_sizes[i]; + num_generated_tokens += batch_sizes[i]; + start_time_val = tok_times[i]; + } + } -PerfMetrics PerfMetrics::operator+(const PerfMetrics& metrics) const { - PerfMetrics nm; // new metrics - nm.m_counters = m_counters; - auto& new_counters = nm.m_counters; + std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_counters.m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_counters.m_times_to_first_token); - auto& new_durations = new_counters->m_durations; - auto& new_times_to_first_token = new_counters->m_times_to_first_token; - - auto& counters_to_appnd = metrics.m_counters; - new_durations.insert(new_durations.end(), counters_to_appnd->m_durations.begin(), counters_to_appnd->m_durations.end()); - new_times_to_first_token.insert(new_times_to_first_token.end(), counters_to_appnd->m_times_to_first_token.begin(), counters_to_appnd->m_times_to_first_token.end()); + std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_counters.generate_durations); + std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_counters.tokenization_durations); + std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_counters.detokenization_durations); - OPENVINO_ASSERT(metrics.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); + mean_throughput = 1000.0f / mean_tpot; + std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); +} + +PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { + OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); - std::tie(nm.mean_tpot, nm.std_tpot) = calc_mean_and_std(new_counters->m_durations); - std::tie(nm.mean_ttft, nm.std_ttft) = calc_mean_and_std(new_counters->m_times_to_first_token); + // Copy left value to res. 
+ PerfMetrics res = *this; + + // Concatenate duration and first token times. + auto& new_durations = res.raw_counters.m_durations; + auto& new_times_to_first_token = res.raw_counters.m_times_to_first_token; + auto& right_durations = right.raw_counters.m_durations; + auto& right_times_to_first_token = right.raw_counters.m_times_to_first_token; - // todo: add tokenization statistics concatenation. + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + + // Concatenate tokenization/detokenization and total generation times. + auto& new_tok_durations = res.raw_counters.tokenization_durations; + auto& new_detok_durations = res.raw_counters.detokenization_durations; + auto& new_gen_durations = res.raw_counters.generate_durations; + auto& right_tok_durations = right.raw_counters.tokenization_durations; + auto& right_detok_durations = right.raw_counters.detokenization_durations; + auto& right_gen_durations = right.raw_counters.generate_durations; - return nm; + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); + new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); + new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); + + res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; + res.num_input_tokens = num_input_tokens + right.num_input_tokens; + res.load_time = load_time; + res.evaluate_statistics(); + return res; } PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { @@ -75,7 +103,5 @@ PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { return *this; } - - } // namespace genai } // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 
501d0e86cf..ac6b925dcb 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -323,8 +323,6 @@ class Tokenizer::TokenizerImpl { // Replace what jinja2cpp doesn't support std::pair replace_str_map[] = { - {"{-", "{"}, - {"{%-", "{%"}, {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""} diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index c78c760b6c..860d3c3592 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -22,6 +22,7 @@ using ov::genai::GenerationResult; using ov::genai::LLMPipeline; using ov::genai::OptionalGenerationConfig; using ov::genai::PerfMetrics; +using ov::genai::RawPerfMetrics; using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; @@ -535,13 +536,30 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) + .def_readonly("metrics", &DecodedResults::metrics) .def("__str__", &DecodedResults::operator std::string);; + py::class_(m, "RawPerfMetrics") + .def(py::init<>()) + .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_readonly("tokenization_durations", &RawPerfMetrics::tokenization_durations) + .def_readonly("detokenization_durations", &RawPerfMetrics::detokenization_durations) + .def_readonly("m_times_to_first_token", &RawPerfMetrics::m_times_to_first_token) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) + .def_readonly("m_durations", &RawPerfMetrics::m_durations) + .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) + .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + py::class_(m, "PerfMetrics") .def(py::init<>()) .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) - .def_readonly("mean_decoding_duration", 
&PerfMetrics::mean_decoding_duration) - .def_readonly("mean_encoding_duration", &PerfMetrics::mean_encoding_duration) + .def_readonly("std_generate_duration", &PerfMetrics::std_generate_duration) + .def_readonly("mean_tokenization_duration", &PerfMetrics::mean_tokenization_duration) + .def_readonly("std_tokenization_duration", &PerfMetrics::std_tokenization_duration) + .def_readonly("mean_detokenization_duration", &PerfMetrics::mean_detokenization_duration) + .def_readonly("std_detokenization_duration", &PerfMetrics::std_detokenization_duration) + .def_readonly("mean_throughput", &PerfMetrics::mean_throughput) + .def_readonly("std_throughput", &PerfMetrics::std_throughput) .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) .def_readonly("std_tpot", &PerfMetrics::std_tpot) @@ -557,7 +575,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) - .def_readonly("scores", &EncodedResults::scores); + .def_readonly("scores", &EncodedResults::scores) + .def_readonly("metrics", &EncodedResults::metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 5d038e65e2..4ba71a1d48 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -81,8 +81,6 @@ def get_chat_templates(): # but skips some models that currently are not processed correctly. skipped_models = { - "berkeley-nest/Starling-LM-7B-alpha", # TODO: Need to enable and unskip, since it's preset in continious batching and has ~30 000 downloads. - # These models fail even on HF so no need to check if applying chat matches. 
"vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", From 0a8f0d95dcd37e59cced6a959de719d8a53e5c98 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 22 Jul 2024 17:24:33 +0200 Subject: [PATCH 05/15] add more preicise durations --- .../benchmark_vanilla_genai.cpp | 2 +- .../python/benchmark_vanilla_genai/README.md | 15 ++++++------ .../benchmark_vanilla_genai.py | 9 ++++--- .../include/openvino/genai/perf_metrics.hpp | 11 +++++---- src/cpp/src/greedy_decoding.cpp | 1 + src/cpp/src/llm_pipeline.cpp | 6 +++-- src/cpp/src/multinomial_decoding.cpp | 8 ++++++- src/cpp/src/perf_metrics.cpp | 24 ++++++++++++------- src/cpp/src/sampler.hpp | 9 ++----- src/cpp/src/utils.hpp | 14 ----------- src/python/py_generate_pipeline.cpp | 6 +++-- 11 files changed, 52 insertions(+), 53 deletions(-) diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp index 6d96d24fc5..a9bc07f641 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp @@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try { ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(5))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_vanilla_genai/README.md index 
af66ea545d..13666a7de9 100644 --- a/samples/python/benchmark_vanilla_genai/README.md +++ b/samples/python/benchmark_vanilla_genai/README.md @@ -56,11 +56,12 @@ python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ ``` ``` -Load time: 3446 ms -Generate time: 876.2 ± 3.30719 ms -Tokenization time: 0 ± 0 ms -Detokenization time: 0 ± 0 ms -ttft: 168 ± 0 ms -tpot: 174.68 ± 4.08671 ms -Tokens/s: 5.72475 ± 0.133933 +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 ``` +s \ No newline at end of file diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py index 4c87234179..9e4debe847 100755 --- a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py +++ b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py @@ -10,7 +10,7 @@ def main(): parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") - parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations") + parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") @@ -22,9 +22,8 @@ def main(): num_warmup = args.num_warmup num_iter = args.num_iter - config = ov_genai.GenerationConfig() - config.max_new_tokens = args.num_new_tokens + config.max_new_tokens = args.max_new_tokens pipe = ov_genai.LLMPipeline(model_path, device) @@ -37,8 +36,8 @@ def main(): # pdb.set_trace() 
res = pipe.generate(prompt, config) metrics += res.metrics - - print(f"Load time: {metrics.load_time} ms") + + print(f"Load time: {metrics.load_time:.2f} ms") print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms") diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index e66c917e81..5779b9b080 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -13,19 +13,20 @@ namespace ov { namespace genai { using TimePoint = std::chrono::steady_clock::time_point; +using MicroSeconds = std::chrono::duration>; /** * @brief Structure with raw performance metrics for each generation before any statistics calculated. */ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { - std::vector generate_durations; - std::vector tokenization_durations; - std::vector detokenization_durations; + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; - std::vector m_times_to_first_token; + std::vector m_times_to_first_token; std::vector m_new_token_times; std::vector m_batch_sizes; - std::vector m_durations; + std::vector m_durations; size_t num_generated_tokens; size_t num_input_tokens; diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index c5bf10a2d1..c8fd36cbdd 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -22,6 +22,7 @@ EncodedResults greedy_decoding( size_t prompt_len = prompts_shape[1]; size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); + // Initialize results and performance metrics. 
EncodedResults results; auto& raw_perf_counters = results.metrics.raw_counters; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5241142afe..adac9110e1 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -165,7 +165,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto& raw_counters = decoded_results.metrics.raw_counters; auto stop_time = std::chrono::steady_clock::now(); - + raw_counters.generate_durations = std::vector(); raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); @@ -269,11 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { m_is_cache_empty = false; } + auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. 
auto& metrics = result.metrics; metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); metrics.load_time = this->m_load_time_ms; + metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); metrics.evaluate_statistics(start_time); return result; } @@ -391,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time); + m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f; } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index fd16e948c1..fc59f00e12 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -162,7 +162,9 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner size_t prompt_len = prompts_shape[1]; - ov::genai::EncodedResults results; + // Initialize results and performance metrics. 
+ EncodedResults results; + auto& raw_perf_counters = results.metrics.raw_counters; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); @@ -179,6 +181,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("beam_idx").data()[0] = 0; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); auto logits_tensor = m_model_runner.get_tensor("logits"); @@ -222,6 +226,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("input_ids").data()[0] = out_token.id; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); logits = m_model_runner.get_tensor("logits").data(); out_token = sampling.get_out_token(logits, vocab_size, tokens); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 3947793802..d4dc6c8de6 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,12 +9,18 @@ namespace { -std::pair calc_mean_and_std(const std::vector& durations) { - float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size(); +// std::pair calc_mean_and_std(const std::vector& durations) { +std::pair calc_mean_and_std(const std::vector& durations) { + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count(); + }); + mean /= durations.size(); + mean /= 1000.f; float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, - [](const float& acc, const float& duration) -> float { - return acc + duration * duration; + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count() 
* duration.count() / 1000000.0f; }); float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; @@ -27,7 +33,7 @@ namespace ov { namespace genai { float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) { - return std::chrono::duration_cast(duration).count(); + return std::chrono::duration_cast(duration).count(); } void PerfMetrics::evaluate_statistics(std::optional start_time) { @@ -36,14 +42,14 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { auto start_time_val = *start_time; auto& tok_times = raw_counters.m_new_token_times; auto& batch_sizes = raw_counters.m_batch_sizes; - raw_counters.m_durations = std::vector(tok_times.size()); + raw_counters.m_durations = std::vector(tok_times.size()); - auto ttft = std::chrono::duration_cast(tok_times[0] - start_time_val).count(); - raw_counters.m_times_to_first_token = std::vector(); + auto ttft = tok_times[0] - start_time_val; + raw_counters.m_times_to_first_token = std::vector(); raw_counters.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = 0; for (size_t i = 0; i < tok_times.size(); ++i) { - raw_counters.m_durations[i] = std::chrono::duration_cast(tok_times[i] - start_time_val).count(); + raw_counters.m_durations[i] = tok_times[i] - start_time_val; // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. // todo: float check that it's valid for batch > 1. 
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 6390fc8725..dc631c68ac 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -219,13 +219,8 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - Token max_token{-std::numeric_limits::infinity() , 0}; - for (const auto& logit : logit_vector) { - if (logit.m_log_prob > max_token.m_log_prob) { - max_token = logit; - } - } - return max_token; + auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); + return *out_token; } std::vector _multinomial_sample(const std::vector& logit_vector, size_t num_tokens_per_sequence) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 446ef8549b..25acc1c87f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -12,20 +12,6 @@ namespace ov { namespace genai { namespace utils { -#include -#include -#include - -// Templated function to measure execution time of an object method. -template -std::pair execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... 
args) { - auto start = std::chrono::steady_clock::now(); - Ret result = (instance.*method)(std::forward(args)...); - auto end = std::chrono::steady_clock::now(); - auto duration = std::chrono::duration_cast(end - start).count(); - return {result, duration}; -} - Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 860d3c3592..e2f89cd962 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -537,7 +537,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) .def_readonly("metrics", &DecodedResults::metrics) - .def("__str__", &DecodedResults::operator std::string);; + .def("__str__", &DecodedResults::operator std::string); py::class_(m, "RawPerfMetrics") .def(py::init<>()) @@ -566,7 +566,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("std_ttft", &PerfMetrics::std_ttft) .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) - .def("__iadd__", &PerfMetrics::operator+=); + .def("__iadd__", &PerfMetrics::operator+=) + .def_readonly("raw_counters", &PerfMetrics::raw_counters) + ; py::class_(m, "TokenizedInputs") .def(py::init()) From 90320f411257e215d06bcdf100d37bbe20f1622e Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 21:57:11 +0200 Subject: [PATCH 06/15] add cpp Readme, ensured correct batch processing, add PerfMetrics to Readme --- samples/CMakeLists.txt | 2 +- .../CMakeLists.txt | 12 ++-- samples/cpp/benchmark_genai/README.md | 47 +++++++++++++ .../benchmark_genai.cpp} | 12 ++-- samples/cpp/benchmark_vanilla_genai/README.md | 3 - .../README.md | 30 ++------- .../benchmark_genai.py} | 24 ++++--- .../benchmark_genai_automatic.py | 62 +++++++++++++++++ src/README.md | 49 ++++++++++++++ 
.../include/openvino/genai/llm_pipeline.hpp | 4 +- .../include/openvino/genai/perf_metrics.hpp | 24 ++++--- src/cpp/src/greedy_decoding.cpp | 2 +- src/cpp/src/group_beam_searcher.cpp | 2 +- src/cpp/src/llm_pipeline.cpp | 18 ++--- src/cpp/src/multinomial_decoding.cpp | 2 +- src/cpp/src/perf_metrics.cpp | 67 ++++++++++--------- src/python/py_generate_pipeline.cpp | 33 +++++++-- 17 files changed, 278 insertions(+), 115 deletions(-) rename samples/cpp/{benchmark_vanilla_genai => benchmark_genai}/CMakeLists.txt (64%) create mode 100644 samples/cpp/benchmark_genai/README.md rename samples/cpp/{benchmark_vanilla_genai/benchmark_vanilla_genai.cpp => benchmark_genai/benchmark_genai.cpp} (90%) delete mode 100644 samples/cpp/benchmark_vanilla_genai/README.md rename samples/python/{benchmark_vanilla_genai => benchmark_genai}/README.md (64%) rename samples/python/{benchmark_vanilla_genai/benchmark_vanilla_genai.py => benchmark_genai/benchmark_genai.py} (58%) create mode 100755 samples/python/benchmark_genai/benchmark_genai_automatic.py diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 44f8d580b2..5339817c1f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) -add_subdirectory(cpp/benchmark_vanilla_genai) +add_subdirectory(cpp/benchmark_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) diff --git a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt similarity index 64% rename from samples/cpp/benchmark_vanilla_genai/CMakeLists.txt rename to samples/cpp/benchmark_genai/CMakeLists.txt index e871f5a33a..bfa1592f61 100644 --- a/samples/cpp/benchmark_vanilla_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -12,14 +12,14 @@ FetchContent_Declare(cxxopts 
URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) FetchContent_MakeAvailable(cxxopts) -add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp) -target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts) -set_target_properties(benchmark_vanilla_genai PROPERTIES - COMPILE_PDB_NAME benchmark_vanilla_genai +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11) -install(TARGETS benchmark_vanilla_genai +# target_compile_features(benchmark_genai PRIVATE cxx_std_11) +install(TARGETS benchmark_genai RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md new file mode 100644 index 0000000000..bac16c2f7d --- /dev/null +++ b/samples/cpp/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# Benchmarking Vanilla GenAI + +This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
+ +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +benchmark_genai [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 +``` + +For more information on how performance metrics are calculated, please see the [performance-metrics tutorial](../../../src/README.md#performance-metrics). 
diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp similarity index 90% rename from samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp rename to samples/cpp/benchmark_genai/benchmark_genai.cpp index a9bc07f641..9610aabe54 100644 --- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -8,11 +8,11 @@ int main(int argc, char* argv[]) try { cxxopts::Options options("benchmark_vanilla_genai", "Help command"); options.add_options() - ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) - ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) - ("mt,max_new_tokens", "Number of iterations", cxxopts::value()->default_value(std::to_string(20))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(3))) + ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value()->default_value(std::to_string(20))) ("d,device", "device", cxxopts::value()->default_value("CPU")) ("h,help", "Print usage"); @@ -38,6 +38,8 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); + config.num_beam_groups = 3; + config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); @@ -45,10 +47,10 @@ int main(int argc, char* argv[]) try { pipe.generate(prompt, config); ov::genai::DecodedResults res = pipe.generate(prompt, config); - ov::genai::PerfMetrics metrics = res.metrics; + ov::genai::PerfMetrics metrics = res.perf_metrics; for (size_t i = 0; i < num_iter - 1; i++) { res 
= pipe.generate(prompt, config); - metrics = metrics + res.metrics; + metrics = metrics + res.perf_metrics; } std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; diff --git a/samples/cpp/benchmark_vanilla_genai/README.md b/samples/cpp/benchmark_vanilla_genai/README.md deleted file mode 100644 index 50197dad1d..0000000000 --- a/samples/cpp/benchmark_vanilla_genai/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# benchmark OpenVINO GenAI sample - -TODO: adapt from python sample to c++ \ No newline at end of file diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_genai/README.md similarity index 64% rename from samples/python/benchmark_vanilla_genai/README.md rename to samples/python/benchmark_genai/README.md index 13666a7de9..fa4fa85576 100644 --- a/samples/python/benchmark_vanilla_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -1,28 +1,7 @@ -# Benchmark Vanilla GenAI +# Benchmarking Vanilla GenAI This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. -# ov.genai.PerfMetrics structure -ov.genai.PerfMetrics is a structure which holds performance metric for each generate call. Each generate call calcualtes the following metrics: -- mean_ttft - - std_ttft - - mean_tpot - - std_tpot - - load_time - - mean_generate_duration - - std_generate_duration - - mean_tokenization_duration - - std_tokenization_duration - - mean_detokenization_duration - - std_detokenization_duration - - mean_throughput - - std_throughput - - num_generated_tokens - - num_input_tokens - -Performance metrics can be added to one another and accumulated using the += operator or the + operator. In that case the mean values accumulated by several generate calls will be calculated. 
+ - ## Download and convert the model and tokenizers The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -45,14 +24,14 @@ python benchmark_vanilla_genai.py [OPTIONS] - `-m, --model`: Path to the model and tokenizers base directory. - `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. - `-nw, --num_warmup` (default: `1`): Number of warmup iterations. -- `-mt, --max_new_tokens` (default: `20`): Number of warmup iterations. - `-n, --num_iter` (default: `3`): Number of iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens. - `-d, --device` (default: `"CPU"`): Device to run the model on. ### Output: ``` -python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/ +python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 ``` ``` @@ -64,4 +43,5 @@ TTFT: 81.60 ± 0.54 ms TPOT: 71.52 ± 2.72 ms Throughput tokens/s: 13.98 ± 0.53 ``` -s \ No newline at end of file + +For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics). diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_genai/benchmark_genai.py similarity index 58% rename from samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py rename to samples/python/benchmark_genai/benchmark_genai.py index 9e4debe847..06bd8b0f48 100755 --- a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -3,7 +3,6 @@ import argparse import openvino_genai as ov_genai -import pdb def main(): parser = argparse.ArgumentParser(description="Help command") @@ -16,6 +15,8 @@ def main(): args = parser.parse_args() + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. 
prompt = [args.prompt] model_path = args.model device = args.device @@ -24,6 +25,8 @@ def main(): config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens + config.num_beam_groups = 3 + config.num_beams = 15 pipe = ov_genai.LLMPipeline(model_path, device) @@ -31,19 +34,18 @@ def main(): pipe.generate(prompt, config) res = pipe.generate(prompt, config) - metrics = res.metrics + perf_metrics = res.perf_metrics for _ in range(num_iter - 1): - # pdb.set_trace() res = pipe.generate(prompt, config) - metrics += res.metrics + perf_metrics += res.perf_metrics - print(f"Load time: {metrics.load_time:.2f} ms") - print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms") - print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms") - print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms") - print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms") - print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms") - print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}") + print(f"Load time: {perf_metrics.load_time:.2f} ms") + print(f"Generate time: {perf_metrics.mean_generate_duration:.2f} ± {perf_metrics.std_generate_duration:.2f} ms") + print(f"Tokenization time: {perf_metrics.mean_tokenization_duration:.2f} ± {perf_metrics.std_tokenization_duration:.2f} ms") + print(f"Detokenization time: {perf_metrics.mean_detokenization_duration:.2f} ± {perf_metrics.std_detokenization_duration:.2f} ms") + print(f"TTFT: {perf_metrics.mean_ttft:.2f} ± {perf_metrics.std_ttft:.2f} ms") + print(f"TPOT: {perf_metrics.mean_tpot:.2f} ± {perf_metrics.std_tpot:.2f} ms") + print(f"Throughput tokens/s: {perf_metrics.mean_throughput:.2f} ± {perf_metrics.std_throughput:.2f}") if __name__ == "__main__": main() diff --git 
a/samples/python/benchmark_genai/benchmark_genai_automatic.py b/samples/python/benchmark_genai/benchmark_genai_automatic.py new file mode 100755 index 0000000000..98a00a8c99 --- /dev/null +++ b/samples/python/benchmark_genai/benchmark_genai_automatic.py @@ -0,0 +1,62 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai +import pdb + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=5, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. 
+ + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + config = ov_genai.GenerationConfig() + config.max_new_tokens = 20 + # config.num_beam_groups = 3 + # config.num_beams = 15 + + pipe = ov_genai.LLMPipeline(model_path, device) + + import pandas as pd + metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot']) + + batch_sizes = [1, 2, 4, 16, 32, 64, 256] + for batch_size in batch_sizes: + prompt = [args.prompt] * batch_size + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + metrics = res.metrics + for _ in range(num_iter - 1): + res = pipe.generate(prompt, config) + metrics += res.metrics + # pdb.set_trace() + metrics_df = metrics_df._append({ + 'batch_size': batch_size, + 'throughput': metrics.mean_throughput, + 'ttft': metrics.mean_ttft, + 'tpot': metrics.mean_tpot, + 'std_throughput': metrics.std_throughput, + 'std_ttft': metrics.std_ttft, + 'std_tpot': metrics.std_tpot, + }, ignore_index=True) + + metrics_df.to_csv('metrics.csv', index=False) + +if __name__ == "__main__": + main() diff --git a/src/README.md b/src/README.md index 445b88aa58..a5530ea578 100644 --- a/src/README.md +++ b/src/README.md @@ -196,6 +196,55 @@ int main(int argc, char* argv[]) { } ``` +### Performance Metrics + +`ov.genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` hold fields with mean and standard deviations for the following metrics: +- `ttft` +- `tpot` +- `load_time` +- `generate_duration` +- `tokenization_duration` +- `detokenization_duration` +- `throughput` + +and: +- `num_generated_tokens` +- `num_input_tokens` + +Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. 
In addition to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `ov.genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. + +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +perf_metrics = res.perf_metrics +print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') +print(f'ttft: {perf_metrics.mean_ttft:.2f}') +print(f'tpot: {perf_metrics.mean_tpot:.2f}') +``` +output: +```sh +generate_duration: 76.28 +ttft: 42.58 +tpot: 3.80 +``` + +>**Note**: If the input prompt is just a string, the generate function will return only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. + +Several `perf_metrics` can be added to each other. In that case `raw_metrics` will be concatenated and mean/std values will be recalculated. This simplifies benchmarking and accumulating statistics from several calls.
+ +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) +perf_metrics = res_1.perf_metrics + res_2.perf_metrics + +print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') +print(f'ttft: {perf_metrics.mean_ttft:.2f}') +print(f'tpot: {perf_metrics.mean_tpot:.2f}') +``` + ## How It Works For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 14100d4f16..4be298128e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -37,7 +37,7 @@ class EncodedResults { public: std::vector> tokens; std::vector scores; - PerfMetrics metrics; + PerfMetrics perf_metrics; }; /** @@ -52,7 +52,7 @@ class DecodedResults { public: std::vector texts; std::vector scores; - PerfMetrics metrics; + PerfMetrics perf_metrics; // @brief Convert DecodedResults to a string. operator std::string() const { diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 5779b9b080..44535cf3a2 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -37,23 +37,25 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { * */ struct OPENVINO_GENAI_EXPORTS PerfMetrics { - // First token time. + // Load time in ms. + float load_time; + + // First token time (in ms). float mean_ttft; float std_ttft; - // Time per output token. + // Time (in ms) per output token. 
float mean_tpot; float std_tpot; - float load_time; - float mean_generate_duration; float std_generate_duration; - float mean_tokenization_duration; - float std_tokenization_duration; - float mean_detokenization_duration; - float std_detokenization_duration; - + float mean_tokenization_duration = -1; + float std_tokenization_duration = -1; + float mean_detokenization_duration = -1; + float std_detokenization_duration = -1; + + // Tokens per second. float mean_throughput; float std_throughput; @@ -61,11 +63,11 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_input_tokens; void evaluate_statistics(std::optional start_time = std::nullopt); - static float get_duration_ms(std::chrono::steady_clock::duration duration); + static float get_microsec(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); - RawPerfMetrics raw_counters; + RawPerfMetrics raw_metrics; }; } // namespace genai diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index c8fd36cbdd..8b0cf19c1f 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -24,7 +24,7 @@ EncodedResults greedy_decoding( // Initialize results and performance metrics. 
EncodedResults results; - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 784ff1a915..1b9729b2f6 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -444,7 +444,7 @@ std::pair beam_search(ov::InferRequest& lm, int32_t res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; raw_perf_counters.m_new_token_times = new_token_times; raw_perf_counters.m_batch_sizes = batch_sizes; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index adac9110e1..1c1bd5ccd8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -161,16 +161,16 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } // generate_durations - decoded_results.metrics = encoded_results.metrics; + decoded_results.perf_metrics = encoded_results.perf_metrics; - auto& raw_counters = decoded_results.metrics.raw_counters; + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; auto stop_time = std::chrono::steady_clock::now(); raw_counters.generate_durations = std::vector(); - raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); - raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time)); - raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time)); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + 
raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); - decoded_results.metrics.evaluate_statistics(start_time); + decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } @@ -272,10 +272,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { auto stop_time = std::chrono::steady_clock::now(); // If is called without tokenization then that stat will not be reported. - auto& metrics = result.metrics; + auto& metrics = result.perf_metrics; metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); metrics.load_time = this->m_load_time_ms; - metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time)); + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); metrics.evaluate_statistics(start_time); return result; } @@ -393,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline( m_pimpl = make_unique(std::filesystem::path(path), device, config); } auto stop_time = std::chrono::steady_clock::now(); - m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f; + m_pimpl->m_load_time_ms = PerfMetrics::get_microsec(stop_time - start_time) / 1000.0f; } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index fc59f00e12..b00c62aed7 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -164,7 +164,7 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner // Initialize results and performance metrics. 
EncodedResults results; - auto& raw_perf_counters = results.metrics.raw_counters; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index d4dc6c8de6..c319032449 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,18 +9,18 @@ namespace { -// std::pair calc_mean_and_std(const std::vector& durations) { std::pair calc_mean_and_std(const std::vector& durations) { + // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { - return acc + duration.count(); + return acc + duration.count() / 1000.0f; }); mean /= durations.size(); - mean /= 1000.f; float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { - return acc + duration.count() * duration.count() / 1000000.0f; + auto d = duration.count() / 1000.0f; + return acc + d * d; }); float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); return {mean, std}; @@ -32,7 +32,7 @@ std::pair calc_mean_and_std(const std::vector(duration).count(); } @@ -40,33 +40,33 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. 
if (start_time.has_value()) { auto start_time_val = *start_time; - auto& tok_times = raw_counters.m_new_token_times; - auto& batch_sizes = raw_counters.m_batch_sizes; - raw_counters.m_durations = std::vector(tok_times.size()); + auto& tok_times = raw_metrics.m_new_token_times; + auto& batch_sizes = raw_metrics.m_batch_sizes; + raw_metrics.m_durations = std::vector(tok_times.size()); auto ttft = tok_times[0] - start_time_val; - raw_counters.m_times_to_first_token = std::vector(); - raw_counters.m_times_to_first_token.emplace_back(ttft); + raw_metrics.m_times_to_first_token = std::vector(); + raw_metrics.m_times_to_first_token.emplace_back(ttft); num_generated_tokens = 0; for (size_t i = 0; i < tok_times.size(); ++i) { - raw_counters.m_durations[i] = tok_times[i] - start_time_val; + raw_metrics.m_durations[i] = tok_times[i] - start_time_val; - // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5. - // todo: float check that it's valid for batch > 1. - raw_counters.m_durations[i] /= batch_sizes[i]; + // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 ms/tok. + raw_metrics.m_durations[i] /= batch_sizes[i]; num_generated_tokens += batch_sizes[i]; start_time_val = tok_times[i]; } } + + // calc_mean_and_std will convert microseconds to milliseconds.
+ std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_metrics.m_durations); + std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_metrics.m_times_to_first_token); - std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_counters.m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_counters.m_times_to_first_token); - - std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_counters.generate_durations); - std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_counters.tokenization_durations); - std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_counters.detokenization_durations); + std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_metrics.generate_durations); + std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_metrics.tokenization_durations); + std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_metrics.detokenization_durations); - mean_throughput = 1000.0f / mean_tpot; + mean_throughput = 1000.0f / mean_tpot; // tokens per second std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); } @@ -76,22 +76,25 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { // Copy left value to res. PerfMetrics res = *this; - // Concatenate duration and first token times. - auto& new_durations = res.raw_counters.m_durations; - auto& new_times_to_first_token = res.raw_counters.m_times_to_first_token; - auto& right_durations = right.raw_counters.m_durations; - auto& right_times_to_first_token = right.raw_counters.m_times_to_first_token; + // Concatenate durations, batch_sizes first token times. 
+ auto& new_durations = res.raw_metrics.m_durations; + auto& new_batch_sizes = res.raw_metrics.m_batch_sizes; + auto& new_times_to_first_token = res.raw_metrics.m_times_to_first_token; + auto& right_durations = right.raw_metrics.m_durations; + auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; + auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); // Concatenate tokenization/detokenization and total generation times. - auto& new_tok_durations = res.raw_counters.tokenization_durations; - auto& new_detok_durations = res.raw_counters.detokenization_durations; - auto& new_gen_durations = res.raw_counters.generate_durations; - auto& right_tok_durations = right.raw_counters.tokenization_durations; - auto& right_detok_durations = right.raw_counters.detokenization_durations; - auto& right_gen_durations = right.raw_counters.generate_durations; + auto& new_tok_durations = res.raw_metrics.tokenization_durations; + auto& new_detok_durations = res.raw_metrics.detokenization_durations; + auto& new_gen_durations = res.raw_metrics.generate_durations; + auto& right_tok_durations = right.raw_metrics.tokenization_durations; + auto& right_detok_durations = right.raw_metrics.detokenization_durations; + auto& right_gen_durations = right.raw_metrics.generate_durations; new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index e2f89cd962..6c88b3ffcc 100644 --- 
a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -38,6 +38,17 @@ using PyBindStreamerVariant = std::variant, std::sh template struct overloaded : Ts... { using Ts::operator()...; }; template overloaded(Ts...) -> overloaded; +template +std::vector get_ms(const T& instance, U T::*member) { + // Converts c++ duration to float so that it can be used in Python. + std::vector res; + const auto& durations = instance.*member; + res.reserve(durations.size()); + std::transform(durations.begin(), durations.end(), std::back_inserter(res), + [](const auto& duration) { return duration.count(); }); + return res; +} + namespace { auto generate_docstring = R"( @@ -536,17 +547,25 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) - .def_readonly("metrics", &DecodedResults::metrics) + .def_readonly("perf_metrics", &DecodedResults::perf_metrics) .def("__str__", &DecodedResults::operator std::string); py::class_(m, "RawPerfMetrics") .def(py::init<>()) .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) - .def_readonly("tokenization_durations", &RawPerfMetrics::tokenization_durations) - .def_readonly("detokenization_durations", &RawPerfMetrics::detokenization_durations) - .def_readonly("m_times_to_first_token", &RawPerfMetrics::m_times_to_first_token) + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::tokenization_durations); + }) + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); + }) + .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); + }) + .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { + return 
get_ms(rw, &RawPerfMetrics::m_durations); + }) .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) - .def_readonly("m_durations", &RawPerfMetrics::m_durations) .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); @@ -567,7 +586,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) - .def_readonly("raw_counters", &PerfMetrics::raw_counters) + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics) ; py::class_(m, "TokenizedInputs") @@ -578,7 +597,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) .def_readonly("scores", &EncodedResults::scores) - .def_readonly("metrics", &EncodedResults::metrics); + .def_readonly("perf_metrics", &EncodedResults::perf_metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) From aeec730c4ebd14c90c081df40e50fd49d3c66f0d Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Tue, 23 Jul 2024 23:08:39 +0200 Subject: [PATCH 07/15] use MeanStdPair --- .../cpp/benchmark_genai/benchmark_genai.cpp | 14 +++-- .../python/benchmark_genai/benchmark_genai.py | 14 +++-- .../include/openvino/genai/perf_metrics.hpp | 51 +++++++++++-------- src/cpp/src/perf_metrics.cpp | 18 ++++--- src/python/py_generate_pipeline.cpp | 27 +++++----- 5 files changed, 64 insertions(+), 60 deletions(-) diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 9610aabe54..24b9491219 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -38,8 +38,6 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); - 
config.num_beam_groups = 3; - config.num_beams = 15; ov::genai::LLMPipeline pipe(model_path, device); @@ -54,12 +52,12 @@ int main(int argc, char* argv[]) try { } std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; - std::cout << "Generate time: " << metrics.mean_generate_duration << " ± " << metrics.std_generate_duration << " ms" << std::endl; - std::cout << "Tokenization time: " << metrics.mean_tokenization_duration << " ± " << metrics.std_tokenization_duration << " ms" << std::endl; - std::cout << "Detokenization time: " << metrics.mean_detokenization_duration << " ± " << metrics.std_detokenization_duration << " ms" << std::endl; - std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl; - std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms " << std::endl; - std::cout << "Tokens/s: " << metrics.mean_throughput << " ± " << metrics.std_throughput << std::endl; + std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; + std::cout << "ttft: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; + std::cout << "tpot: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms " << std::endl; + std::cout << "Tokens/s: " << metrics.throughput.mean << " ± " << metrics.throughput.std << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index 06bd8b0f48..c29c508bf4 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ 
-25,8 +25,6 @@ def main(): config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens - config.num_beam_groups = 3 - config.num_beams = 15 pipe = ov_genai.LLMPipeline(model_path, device) @@ -40,12 +38,12 @@ def main(): perf_metrics += res.perf_metrics print(f"Load time: {perf_metrics.load_time:.2f} ms") - print(f"Generate time: {perf_metrics.mean_generate_duration:.2f} ± {perf_metrics.std_generate_duration:.2f} ms") - print(f"Tokenization time: {perf_metrics.mean_tokenization_duration:.2f} ± {perf_metrics.std_tokenization_duration:.2f} ms") - print(f"Detokenization time: {perf_metrics.mean_detokenization_duration:.2f} ± {perf_metrics.std_detokenization_duration:.2f} ms") - print(f"TTFT: {perf_metrics.mean_ttft:.2f} ± {perf_metrics.std_ttft:.2f} ms") - print(f"TPOT: {perf_metrics.mean_tpot:.2f} ± {perf_metrics.std_tpot:.2f} ms") - print(f"Throughput tokens/s: {perf_metrics.mean_throughput:.2f} ± {perf_metrics.std_throughput:.2f}") + print(f"Generate time: {perf_metrics.generate_duration.mean:.2f} ± {perf_metrics.generate_duration.std:.2f} ms") + print(f"Tokenization time: {perf_metrics.tokenization_duration.mean:.2f} ± {perf_metrics.tokenization_duration.std:.2f} ms") + print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") + print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") + print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") + print(f"Throughput tokens/s: {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f}") if __name__ == "__main__": main() diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 44535cf3a2..8715761792 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -33,36 +33,43 @@ struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { }; /** -* @brief Structure to store 
performance metric for each generation -* +* @brief Structure to store mean and standard deviation values. */ -struct OPENVINO_GENAI_EXPORTS PerfMetrics { - // Load time in ms. - float load_time; - - // First token time (in ms). - float mean_ttft; - float std_ttft; +struct OPENVINO_GENAI_EXPORTS MeanStdPair { + float mean; + float std; +}; - // Time (in ms) per output token. - float mean_tpot; - float std_tpot; +/** +* @brief Structure to store performance metric for each generation. +* +* @param +*/ +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + float load_time; // Load time in ms. + MeanStdPair ttft; // Time to the first token (in ms) (TTFT). + MeanStdPair tpot; // Time (in ms) per output token (TPOT). + MeanStdPair throughput; // Tokens per second. - float mean_generate_duration; - float std_generate_duration; - float mean_tokenization_duration = -1; - float std_tokenization_duration = -1; - float mean_detokenization_duration = -1; - float std_detokenization_duration = -1; - - // Tokens per second. - float mean_throughput; - float std_throughput; + MeanStdPair generate_duration; + MeanStdPair tokenization_duration = {-1, -1}; + MeanStdPair detokenization_duration = {-1, -1}; size_t num_generated_tokens; size_t num_input_tokens; + /** + * @brief calculates mean/std values from raw_metrics. + * + * @param start_time optional start_time in case durations need to be updated.
+ */ void evaluate_statistics(std::optional start_time = std::nullopt); + + /** + * @brief convert duration to microseconds + * + * @param duration duration in + */ static float get_microsec(std::chrono::steady_clock::duration duration); PerfMetrics operator+(const PerfMetrics& metrics) const; PerfMetrics& operator+=(const PerfMetrics& right); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index c319032449..bc394fae52 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -9,7 +9,7 @@ namespace { -std::pair calc_mean_and_std(const std::vector& durations) { +ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { @@ -59,15 +59,17 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { } // calc_mean_and_std will convert microsecond to milliseconds. 
- std::tie(mean_tpot, std_tpot) = calc_mean_and_std(raw_metrics.m_durations); - std::tie(mean_ttft, std_ttft) = calc_mean_and_std(raw_metrics.m_times_to_first_token); + tpot = calc_mean_and_std(raw_metrics.m_durations); + ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); - std::tie(mean_generate_duration, std_generate_duration) = calc_mean_and_std(raw_metrics.generate_durations); - std::tie(mean_tokenization_duration, std_tokenization_duration) = calc_mean_and_std(raw_metrics.tokenization_durations); - std::tie(mean_detokenization_duration, std_detokenization_duration) = calc_mean_and_std(raw_metrics.detokenization_durations); + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); + + tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); + detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); - mean_throughput = 1000.0f / mean_tpot; // tokens per second - std_throughput = (std_tpot * 1000.0f) / (mean_tpot * mean_tpot); + // tokens per second + throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)}; } PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 6c88b3ffcc..e744179c34 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -20,6 +20,7 @@ using ov::genai::EncodedResults; using ov::genai::GenerationConfig; using ov::genai::GenerationResult; using ov::genai::LLMPipeline; +using ov::genai::MeanStdPair; using ov::genai::OptionalGenerationConfig; using ov::genai::PerfMetrics; using ov::genai::RawPerfMetrics; @@ -569,25 +570,23 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + py::class_(m, 
"MeanStdPair") + .def(py::init<>()) + .def_readonly("mean", &MeanStdPair::mean) + .def_readonly("std", &MeanStdPair::std); + py::class_(m, "PerfMetrics") .def(py::init<>()) - .def_readonly("mean_generate_duration", &PerfMetrics::mean_generate_duration) - .def_readonly("std_generate_duration", &PerfMetrics::std_generate_duration) - .def_readonly("mean_tokenization_duration", &PerfMetrics::mean_tokenization_duration) - .def_readonly("std_tokenization_duration", &PerfMetrics::std_tokenization_duration) - .def_readonly("mean_detokenization_duration", &PerfMetrics::mean_detokenization_duration) - .def_readonly("std_detokenization_duration", &PerfMetrics::std_detokenization_duration) - .def_readonly("mean_throughput", &PerfMetrics::mean_throughput) - .def_readonly("std_throughput", &PerfMetrics::std_throughput) - .def_readonly("mean_tpot", &PerfMetrics::mean_tpot) - .def_readonly("mean_ttft", &PerfMetrics::mean_ttft) - .def_readonly("std_tpot", &PerfMetrics::std_tpot) - .def_readonly("std_ttft", &PerfMetrics::std_ttft) + .def_readonly("generate_duration", &PerfMetrics::generate_duration) + .def_readonly("tokenization_duration", &PerfMetrics::tokenization_duration) + .def_readonly("detokenization_duration", &PerfMetrics::detokenization_duration) + .def_readonly("throughput", &PerfMetrics::throughput) + .def_readonly("tpot", &PerfMetrics::tpot) + .def_readonly("ttft", &PerfMetrics::ttft) .def_readonly("load_time", &PerfMetrics::load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) - .def_readonly("raw_metrics", &PerfMetrics::raw_metrics) - ; + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); py::class_(m, "TokenizedInputs") .def(py::init()) From 406393f93063f2a82bc16d106ffb6df8893511d1 Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Fri, 26 Jul 2024 08:51:32 +0200 Subject: [PATCH 08/15] Prefix caching. 
(#675) Port of https://github.com/openvinotoolkit/openvino.genai/pull/639 --- .../openvino/genai/scheduler_config.hpp | 8 + src/cpp/src/block_manager.hpp | 259 +++++++++++++++++- src/cpp/src/scheduler.hpp | 28 +- src/cpp/src/sequence_group.hpp | 21 ++ src/python/py_generate_pipeline.cpp | 4 +- tests/cpp/CMakeLists.txt | 5 +- tests/cpp/block_manager.cpp | 32 ++- tests/cpp/evictor.cpp | 54 ++++ tests/cpp/scheduler.cpp | 66 +++++ 9 files changed, 442 insertions(+), 35 deletions(-) create mode 100644 tests/cpp/evictor.cpp diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index 9d808fd424..aca823fa63 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -30,5 +30,13 @@ struct SchedulerConfig { // max number of scheduled sequences (you can think of it as "max batch size") std::size_t max_num_seqs = 256; + + // Enable caching of KV-blocks. + // When turned on all previously calculated KV-caches are kept in memory for future usages. + // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + // When turned off only KV-cache required for batch calculation is kept in memory and + // when a sequence has finished generation its cache is released. 
+ bool enable_prefix_caching = false; }; } diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp index ab60b7f5ff..8c9c3ed512 100644 --- a/src/cpp/src/block_manager.hpp +++ b/src/cpp/src/block_manager.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "sequence_group.hpp" @@ -13,13 +14,17 @@ namespace ov::genai { class KVCacheBlock { int m_ref_count; int m_index; + size_t m_hash; + size_t m_num_hashed_tokens; + std::chrono::time_point m_timestamp; public: using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; explicit KVCacheBlock(int index) : m_ref_count(0), - m_index(index) { } + m_index(index), + m_timestamp(std::chrono::system_clock::now()) { } int get_index() const { return m_index; @@ -34,6 +39,7 @@ class KVCacheBlock { } void release() { + OPENVINO_ASSERT(m_ref_count > 0); --m_ref_count; } @@ -44,15 +50,79 @@ class KVCacheBlock { int get_references_count() const { return m_ref_count; } + + size_t get_hash() const { + return m_hash; + } + + size_t get_num_hashed_tokens() const { + return m_num_hashed_tokens; + } + + void set_hash(size_t hash, size_t num_hashed_tokens) { + m_hash = hash; + m_num_hashed_tokens = num_hashed_tokens; + } + + void set_timestamp(const std::chrono::time_point& timestamp) { + m_timestamp = timestamp; + } + + std::chrono::time_point get_timestamp() { + return m_timestamp; + } +}; + + +class Evictor { + std::map blocks; +public: + void add(size_t hash, KVCacheBlock::Ptr block) { + blocks[hash] = block; + } + + static bool block_is_less(const std::pair& lhs, const std::pair& rhs) { + return lhs.second->get_timestamp() < rhs.second->get_timestamp(); + } + + KVCacheBlock::Ptr get_block(size_t hash) { + if (blocks.find(hash)== blocks.end()) + { + return nullptr; + } + KVCacheBlock::Ptr block = blocks[hash]; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash); + return block; + } + + KVCacheBlock::Ptr get_lru_block() { + if (!blocks.size()) { + return 
nullptr; + } + auto hash_block = std::min_element(std::begin(blocks), std::end(blocks), block_is_less); + auto block = hash_block->second; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash_block->first); + return block; + } + + size_t num_blocks() const { + return blocks.size(); + } }; class BlockAllocator { std::list m_free_blocks; + ov::genai::Evictor m_evictor; int m_total_num_blocks; + bool m_enable_prefix_caching; public: - BlockAllocator(int num_blocks) : - m_total_num_blocks(num_blocks) { + BlockAllocator(int num_blocks, bool enable_prefix_caching) : + m_total_num_blocks(num_blocks), m_enable_prefix_caching(enable_prefix_caching) { for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { m_free_blocks.push_back(std::make_shared(block_id)); } @@ -64,21 +134,28 @@ class BlockAllocator { } size_t num_free_blocks() const { - return m_free_blocks.size(); + return m_free_blocks.size() + m_evictor.num_blocks(); } bool can_allocate_blocks(size_t num_blocks) const { - return num_blocks <= m_free_blocks.size(); + return num_blocks <= num_free_blocks(); } void free(KVCacheBlock::Ptr block) { block->release(); if (block->is_free()) { - m_free_blocks.push_back(block); + if (m_enable_prefix_caching) + { + m_evictor.add(block->get_hash(), block); + } + else { + m_free_blocks.push_back(block); + } } } KVCacheBlock::Ptr allocate_block() { + OPENVINO_ASSERT(!m_enable_prefix_caching); OPENVINO_ASSERT(can_allocate_blocks(1)); KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); allocated_block->increment(); @@ -86,20 +163,84 @@ class BlockAllocator { return allocated_block; } + KVCacheBlock::Ptr allocate_block(size_t hash, size_t num_hashed_tokens, std::map& cached_blocks) { + OPENVINO_ASSERT(m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1)); + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cached block from evictor + cached_blocks[hash] = block; + return block; + } 
+ // TODO: Currently we cache all allocated blocks which might be redundant for beam search, + // where blocks of non-used candidates are not needed in cache. + // This part can be improved if we cache only blocks for prompt. + if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cached block from cached_blocks + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + if (m_free_blocks.size() > 0) { + // allocate new empty block + KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); + allocated_block->increment(); + allocated_block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = allocated_block; + + m_free_blocks.pop_front(); + return allocated_block; + } + if (m_evictor.num_blocks() > 0) { + // get least recently used block from evictor and reuse it + KVCacheBlock::Ptr block = m_evictor.get_lru_block(); + cached_blocks.erase(block->get_hash()); + + // update block with new hash + block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = block; + return block; + } + // out of memory + return nullptr; + } + + KVCacheBlock::Ptr get_cached_block(size_t hash, std::map& cached_blocks) { + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cached block from evictor + cached_blocks[hash] = block; + return block; + } + if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cached block from cached_blocks + // TODO: add tokens validation in case of hash collision + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + return nullptr; + } + + + float get_used_percentage() const { - return static_cast(m_total_num_blocks - m_free_blocks.size()) / m_total_num_blocks; + return static_cast(m_total_num_blocks - num_free_blocks()) / m_total_num_blocks; } }; class BlockManager { BlockAllocator m_allocator; + bool m_enable_prefix_caching; + size_t m_block_size; + // TODO: caching time can probably be improved if we use the prefix tree + std::map 
cached_blocks; // stores blocks for each sequence (not sequence group) // the same block can be seen in multiple block_tables for different sequences std::map> m_block_table; public: - BlockManager(int num_blocks) - : m_allocator(num_blocks) { } + BlockManager(int num_blocks, bool enable_prefix_caching, size_t block_size) + : m_allocator(num_blocks, enable_prefix_caching), m_enable_prefix_caching(enable_prefix_caching), m_block_size(block_size) { } ~BlockManager() { // sanity check that all sequences are freed @@ -195,11 +336,32 @@ class BlockManager { return m_allocator.can_allocate_blocks(num_blocks); } - void allocate(uint64_t sequence_id, size_t num_blocks) { + void allocate(ov::genai::Sequence::CPtr sequence, size_t num_blocks, const ov::genai::TokenIds& prompt_ids = {}) { OPENVINO_ASSERT(num_blocks > 0 && can_allocate_blocks(num_blocks)); + if (m_enable_prefix_caching) { + OPENVINO_ASSERT(prompt_ids.size() > 0, "prompt_ids should be set for hash calculation."); + } + auto sequence_id = sequence->get_id(); + auto block_table = m_block_table[sequence_id]; + auto content_length = sequence->get_generated_len() + prompt_ids.size(); + size_t num_hashed_tokens = block_table.size() * m_block_size; for (size_t i = 0; i < num_blocks; ++i) { - m_block_table[sequence_id].push_back(m_allocator.allocate_block()); + + ov::genai::KVCacheBlock::Ptr block = nullptr; + if (m_enable_prefix_caching) { + num_hashed_tokens += m_block_size; + if (num_hashed_tokens > content_length) { + num_hashed_tokens = content_length; + } + auto hash = sequence->get_hash(num_hashed_tokens, prompt_ids); + block = m_allocator.allocate_block(hash, num_hashed_tokens, cached_blocks); + } + else { + block = m_allocator.allocate_block(); + } + OPENVINO_ASSERT(block != nullptr); + m_block_table[sequence_id].push_back(block); } } @@ -324,21 +486,36 @@ class BlockManager { if (num_logical_blocks > num_physical_blocks) { OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); - 
allocate(seq_id, num_logical_blocks - num_physical_blocks); + allocate(sequence, num_logical_blocks - num_physical_blocks, seq_group->get_prompt_ids()); } else { OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); KVCacheBlock::Ptr last_block = block_table.back(); - if (last_block->copy_on_write()) { // we need to fork current block, because reference counter is more than 1 - KVCacheBlock::Ptr new_block = m_allocator.allocate_block(); + KVCacheBlock::Ptr new_block = nullptr; + if (m_enable_prefix_caching) { + auto hash = sequence->get_hash(seq_group->get_context_len(), seq_group->get_prompt_ids()); + new_block = m_allocator.allocate_block(hash, seq_group->get_context_len(), cached_blocks); + cached_blocks[hash] = new_block; + } + else { + new_block = m_allocator.allocate_block(); + } block_table[num_physical_blocks - 1] = new_block; // write information about block forking for later usage in CacheManager copy_blocks_map[last_block->get_index()].push_back(new_block->get_index()); // release `last_block` usage m_allocator.free(last_block); } else { - // nothing to do, because we are the only users of this block + // we are the only users of this block + if (m_enable_prefix_caching) { + // update hash of block + auto prev_hash = last_block->get_hash(); + auto hash = sequence->get_hash(seq_group->get_context_len(), seq_group->get_prompt_ids()); + last_block->set_hash(hash, seq_group->get_context_len()); + cached_blocks.erase(prev_hash); + cached_blocks[hash] = last_block; + } } } } @@ -346,5 +523,57 @@ class BlockManager { // it returns information which blocks should be forked by CacheManager return copy_blocks_map; } + + + void _restore_cached_blocks(SequenceGroup::Ptr group, size_t block_size) { + auto prompt_ids = group->get_prompt_ids(); + auto sequences = group->get_not_finished_sequences(); + OPENVINO_ASSERT(sequences.size() == 1); + auto sequence = sequences[0]; + auto seq_id = 
sequence->get_id(); + auto& block_table = m_block_table[seq_id]; + + size_t content_len = 0; + while (content_len < prompt_ids.size()) { + size_t prev_iteration_content_len = content_len; + content_len += block_size; + if (content_len > prompt_ids.size()) { + content_len = prompt_ids.size(); + } + // restore fully filled blocks + auto hash = sequence->get_hash(content_len, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(content_len); + } + else { + // restore partially filled block + for (size_t i = 1; i < block_size; i++) { + if (prev_iteration_content_len + i > prompt_ids.size()) { + break; + } + auto hash = sequence->get_hash(prev_iteration_content_len + i, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(prev_iteration_content_len + i); + + size_t new_tokens_count_in_block = std::min(content_len, prev_iteration_content_len + block_size); + if (new_tokens_count_in_block > prev_iteration_content_len + i) { + cached_blocks.erase(hash); + auto new_hash = sequence->get_hash(new_tokens_count_in_block, prompt_ids); + cached_blocks[new_hash] = block; + } + + break; + } + } + break; + } + } + } }; } diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index ca749137db..cbd6668f90 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -10,7 +10,6 @@ #include "openvino/genai/scheduler_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" -#include "block_manager.hpp" namespace ov::genai { class Scheduler { @@ -34,11 +33,14 @@ class Scheduler { }; explicit Scheduler(const SchedulerConfig & config = {}) : - m_config(config), 
m_block_manager(m_config.num_kv_blocks) { } + m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, m_config.block_size) { } Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (m_config.enable_prefix_caching) + _restore_cached_blocks(sequence_groups); + if (m_config.dynamic_split_fuse) { // deepspeed-mii case // generation phase is always scheduled first @@ -167,6 +169,15 @@ class Scheduler { return std::numeric_limits::max(); } + void _restore_cached_blocks(const std::vector& sequence_groups) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + if (sequence_group->can_generate_tokens() || sequence_group->num_running_seqs() != 1) + continue; + m_block_manager._restore_cached_blocks(sequence_group, m_config.block_size); + } + } + void _apply_preemption(size_t sequence_group_id, const std::vector& sequence_groups) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; @@ -222,7 +233,7 @@ class Scheduler { if (num_scheduled_tokens > 0) { // allocate KV blocks if required if (num_scheduled_blocks > 0) - m_block_manager.allocate(seq_id, num_scheduled_blocks); + m_block_manager.allocate(sequence, num_scheduled_blocks, sequence_group->get_prompt_ids()); // and schedule tokens sequence_group->schedule_tokens(num_scheduled_tokens); @@ -326,7 +337,8 @@ class Scheduler { // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); // here we also assume that sequence must be scheduler in a single shot and has no already generated context - OPENVINO_ASSERT(sequence_group->get_context_len() == 0); + if (!m_config.enable_prefix_caching) + OPENVINO_ASSERT(sequence_group->get_context_len() == 0); size_t num_available_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; size_t sequence_len = 
sequence_group->get_num_available_tokens_for_batching(); @@ -354,11 +366,15 @@ class Scheduler { Sequence::Ptr sequence = (*sequence_group)[0]; uint64_t seq_id = sequence->get_id(); - // allocate KV blocks - m_block_manager.allocate(seq_id, num_required_blocks); // and schedule tokens sequence_group->schedule_tokens(sequence_len); + // allocate KV blocks + if (sequence_group->get_num_processed_tokens() == 0) + m_block_manager.allocate(sequence, num_required_blocks, sequence_group->get_prompt_ids()); + else + m_block_manager.append_slots(sequence_group); + // add information to scheduler_output { scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 3df1820cfb..008a36282e 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "openvino/genai/generation_handle.hpp" #include "openvino/genai/generation_config.hpp" @@ -121,6 +122,21 @@ class Sequence { float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); return score; } + + // Each KV block can be uniquely identified by + // the tokens within the block and the tokens in the prefix before the block. 
+ // hash(prefix tokens + block tokens) <--> KV Block + size_t get_hash(size_t content_length, const ov::genai::TokenIds& prompt_ids) const { + std::vector content; + OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size()); + content.insert( content.end(), prompt_ids.begin(), prompt_ids.begin() + std::min(prompt_ids.size(), content_length)); + if (content_length > prompt_ids.size()) { + content.insert(content.end(), m_generated_ids.begin(), m_generated_ids.begin() + content_length - prompt_ids.size()); + } + const char* data = reinterpret_cast(content.data()); + std::size_t size = content.size() * sizeof(content[0]); + return std::hash{}(std::string_view(data, size)); + } }; // contains a list of Sequences in generic case (beam search or parallel sampling) @@ -345,6 +361,11 @@ class SequenceGroup { clear_scheduled_tokens(); } + void update_processed_tokens_num(size_t processed_tokens) { + m_num_processed_tokens = processed_tokens; + m_max_content_len = processed_tokens; + } + void clear_waiting_sequences() { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index f8888ba258..6175001c29 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -618,11 +618,11 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init<>()) .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) - .def_readwrite("cache_size", &SchedulerConfig::cache_size) .def_readwrite("block_size", &SchedulerConfig::block_size) .def_readwrite("cache_size", &SchedulerConfig::cache_size) .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) - .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) + 
.def_readwrite("enable_prefix_caching", &SchedulerConfig::enable_prefix_caching); py::class_(m, "ContinuousBatchingPipeline") .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 025a58a507..083b911416 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -4,6 +4,9 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(googletest) set(TEST_TARGET_NAME "tests_continuous_batching") -add_executable(${TEST_TARGET_NAME} scheduler.cpp block_manager.cpp logit_filtering.cpp cache_manager.cpp generate_config.cpp) +file(GLOB tests_src + "*.cpp" +) +add_executable(${TEST_TARGET_NAME} ${tests_src}) target_link_libraries(${TEST_TARGET_NAME} PUBLIC openvino::genai gtest_main) target_include_directories(${TEST_TARGET_NAME} PRIVATE "${PROJECT_SOURCE_DIR}/src/cpp/src") diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp index b3c89535a6..5a76a7a0ce 100644 --- a/tests/cpp/block_manager.cpp +++ b/tests/cpp/block_manager.cpp @@ -10,30 +10,40 @@ #include "scheduler.hpp" TEST(TestBlockManager, general_test) { - ov::genai::BlockManager bm = ov::genai::BlockManager(6); + ov::genai::BlockManager bm = ov::genai::BlockManager(6, false, 4); + ov::genai::TokenIds prompt_ids; + + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( + 0, + ov::Tensor(ov::element::i64, { + prompt_ids.size()}, prompt_ids.data()), + ov::genai::beam_search(), + 4); + auto sequence = sequence_group->get_not_finished_sequences()[0]; + bm.allocate(sequence, 6); + auto seq_id = sequence->get_id(); + EXPECT_TRUE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.get_block_table(seq_id).size(), 6); - bm.allocate(0, 6); - EXPECT_TRUE(bm.has_block_table(0)); - EXPECT_EQ(bm.get_block_table(0).size(), 6); EXPECT_EQ(bm.num_free_blocks(), 0); - 
bm.free_sequence_partially_single_runnning_sequence(0, 4); - EXPECT_EQ(bm.get_block_table(0).size(), 2); + bm.free_sequence_partially_single_runnning_sequence(seq_id, 4); + EXPECT_EQ(bm.get_block_table(seq_id).size(), 2); EXPECT_EQ(bm.num_free_blocks(), 4); - bm.free_sequence(0); - EXPECT_FALSE(bm.has_block_table(0)); + bm.free_sequence(seq_id); + EXPECT_FALSE(bm.has_block_table(seq_id)); EXPECT_EQ(bm.num_free_blocks(), 6); - bm.allocate(0, 2); - bm.fork_sequence(0, 1); + bm.allocate(sequence, 2); + bm.fork_sequence(seq_id, 1); EXPECT_TRUE(bm.has_block_table(1)); EXPECT_EQ(bm.get_block_table(1).back()->get_references_count(), 2); } TEST(TestBlockManager, required_blocks_count) { - ov::genai::BlockManager bm = ov::genai::BlockManager(8); + ov::genai::BlockManager bm = ov::genai::BlockManager(8, false, 4); std::vector tokens = {0,1,2,3,4}; ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( diff --git a/tests/cpp/evictor.cpp b/tests/cpp/evictor.cpp new file mode 100644 index 0000000000..9867dfa2b5 --- /dev/null +++ b/tests/cpp/evictor.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include +#include + +TEST(TestEvictor, general_test) { + ov::genai::Evictor evictor; + auto block0 = std::make_shared(0); + block0->set_hash(77, 1); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block1 = std::make_shared(1); + block1->set_hash(56, 2); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block2 = std::make_shared(2); + block2->set_hash(23, 3); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block0->get_hash(), block0); + evictor.add(block1->get_hash(), block1); + evictor.add(block2->get_hash(), block2); + EXPECT_EQ(evictor.num_blocks(), 3); + + auto block = 
evictor.get_block(56); + EXPECT_EQ(block->get_index(), 1); + EXPECT_EQ(block->get_hash(), 56); + EXPECT_EQ(block->get_references_count(), 1); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_block(44), nullptr); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 0); + EXPECT_EQ(evictor.num_blocks(), 1); + + auto block3 = std::make_shared(7); + block3->set_hash(12, 4); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block4 = std::make_shared(10); + block4->set_hash(99, 5); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block3->get_hash(), block3); + evictor.add(block4->get_hash(), block4); + block2->set_timestamp(std::chrono::system_clock::now()); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 7); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 10); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 2); + EXPECT_EQ(evictor.get_lru_block(), nullptr); + EXPECT_EQ(evictor.num_blocks(), 0); +} diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp index b4114dd1b2..82b104223c 100644 --- a/tests/cpp/scheduler.cpp +++ b/tests/cpp/scheduler.cpp @@ -366,3 +366,69 @@ TEST(TestScheduler, test_partially_preempted_prompt) { EXPECT_FALSE(scheduler.has_block_table(idx0)); } } + +TEST(TestScheduler, prefix_caching_test) { + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 100; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(0).enable_prefix_caching = true; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 100; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + configs.at(1).enable_prefix_caching = true; + for (auto scheduler_config: configs) { + std::vector 
prompt_tokens = {0,1,2,3,4,5,6,7}; + std::vector histrory_tokens = {}; + // schedule prompt + Scheduler scheduler = Scheduler(scheduler_config); + + size_t chat_iterations = 10; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + std::vector tokens = histrory_tokens; + tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + SequenceGroup::Ptr sequence_group = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size); + std::vector requests = {sequence_group}; + + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size()); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() + 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // schedule generate + size_t num_generate_tokens = 10; + for (size_t i = 0; i < num_generate_tokens; i++) { + auto out2 = scheduler.schedule(requests); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + } + + // finish sequence + auto sequence = requests[0]->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + auto generated_ids = sequence->get_generated_ids(); + + histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end()); + } + } + +} \ No newline at end of file From be2fdafb273319084999fe944d02e5653d030de7 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 10:12:55 +0200 Subject: 
[PATCH 09/15] resolve conflicts --- src/cpp/src/llm_pipeline.cpp | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 40d4377b00..8505daf3b2 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -510,7 +510,10 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { + auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique(request, tokenizer, generation_config); + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( @@ -518,27 +521,35 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -): m_pimpl{[&]() -> std::unique_ptr { +){ + auto start_time = std::chrono::steady_clock::now(); if ("CB" == device) { - return std::make_unique(model_path, tokenizer, "CPU", plugin_config); - } if ("NPU" == device) { - return std::make_unique(model_path, tokenizer, device, plugin_config); + m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + } else { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); } - return std::make_unique(model_path, tokenizer, device, plugin_config); -}()} {} + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config -): m_pimpl{[&]() -> std::unique_ptr { +){ + auto start_time = std::chrono::steady_clock::now(); if ("CB" == device) { - return std::make_unique(path, "CPU", 
config); - } if ("NPU" == device) { - return std::make_unique(path, device, config); + m_pimpl = std::make_unique(path, "CPU", config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(path, device, config); + } else { + m_pimpl = std::make_unique(path, device, config); } - return std::make_unique(path, device, config); -}()} {} + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); +} ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->m_generation_config; From b00bcd8f411e65c7a5d455fec502fcf2639fa022 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 14:05:36 +0200 Subject: [PATCH 10/15] apply comments --- samples/cpp/benchmark_genai/CMakeLists.txt | 1 - .../cpp/benchmark_genai/benchmark_genai.cpp | 9 +- .../python/benchmark_genai/benchmark_genai.py | 2 +- .../benchmark_genai_automatic.py | 62 -------------- src/README.md | 82 ++++++++++++++----- src/cpp/src/greedy_decoding.cpp | 1 - src/cpp/src/perf_metrics.cpp | 2 - 7 files changed, 67 insertions(+), 92 deletions(-) delete mode 100755 samples/python/benchmark_genai/benchmark_genai_automatic.py diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt index bfa1592f61..5443439de5 100644 --- a/samples/cpp/benchmark_genai/CMakeLists.txt +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -18,7 +18,6 @@ set_target_properties(benchmark_genai PROPERTIES COMPILE_PDB_NAME benchmark_genai # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -# target_compile_features(benchmark_genai PRIVATE cxx_std_11) install(TARGETS benchmark_genai RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 24b9491219..2fd5eafc69 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ 
b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -50,14 +50,15 @@ int main(int argc, char* argv[]) try { res = pipe.generate(prompt, config); metrics = metrics + res.perf_metrics; } - + + std::cout << std::fixed << std::setprecision(2); std::cout << "Load time: " << metrics.load_time << " ms" << std::endl; std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; - std::cout << "ttft: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; - std::cout << "tpot: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms " << std::endl; - std::cout << "Tokens/s: " << metrics.throughput.mean << " ± " << metrics.throughput.std << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " ± " << metrics.throughput.std << " tokens/s" << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index c29c508bf4..ef468053d8 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -43,7 +43,7 @@ def main(): print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") - 
print(f"Throughput tokens/s: {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f}") + print(f"Throughput : {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f} tokens/s") if __name__ == "__main__": main() diff --git a/samples/python/benchmark_genai/benchmark_genai_automatic.py b/samples/python/benchmark_genai/benchmark_genai_automatic.py deleted file mode 100755 index 98a00a8c99..0000000000 --- a/samples/python/benchmark_genai/benchmark_genai_automatic.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import openvino_genai as ov_genai -import pdb - -def main(): - parser = argparse.ArgumentParser(description="Help command") - parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") - parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") - parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") - parser.add_argument("-n", "--num_iter", type=int, default=5, help="Number of iterations") - parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") - parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") - - args = parser.parse_args() - - # Perf metrics is stored in DecodedResults. - # In order to get DecodedResults instead of a string input should be a list. 
- - model_path = args.model - device = args.device - num_warmup = args.num_warmup - num_iter = args.num_iter - - config = ov_genai.GenerationConfig() - config.max_new_tokens = 20 - # config.num_beam_groups = 3 - # config.num_beams = 15 - - pipe = ov_genai.LLMPipeline(model_path, device) - - import pandas as pd - metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot']) - - batch_sizes = [1, 2, 4, 16, 32, 64, 256] - for batch_size in batch_sizes: - prompt = [args.prompt] * batch_size - for _ in range(num_warmup): - pipe.generate(prompt, config) - - res = pipe.generate(prompt, config) - metrics = res.metrics - for _ in range(num_iter - 1): - res = pipe.generate(prompt, config) - metrics += res.metrics - # pdb.set_trace() - metrics_df = metrics_df._append({ - 'batch_size': batch_size, - 'throughput': metrics.mean_throughput, - 'ttft': metrics.mean_ttft, - 'tpot': metrics.mean_tpot, - 'std_throughput': metrics.std_throughput, - 'std_ttft': metrics.std_ttft, - 'std_tpot': metrics.std_tpot, - }, ignore_index=True) - - metrics_df.to_csv('metrics.csv', index=False) - -if __name__ == "__main__": - main() diff --git a/src/README.md b/src/README.md index 3a53e175dd..aa4dc0f301 100644 --- a/src/README.md +++ b/src/README.md @@ -198,29 +198,49 @@ int main(int argc, char* argv[]) { ### Performance Metrics -`ov.genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` hold fields with mean and standard deviations for the following metrics: -- `ttft` -- `tpot` -- `load_time` -- `generate_duration` -- `tokenization_duration` -- `detokenization_duration` -- `throughput` +`openvino_genai.PerfMetrics` (referred as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. 
`PerfMetrics` holds fields with mean and standard deviations for the following metrics: +- Time To the First Token (TTFT), ms +- Time per Output Token (TPOT), ms/token +- Generate total duration, ms +- Tokenization duration, ms +- Detokenization duration, ms +- Throughput, tokens/s and: -- `num_generated_tokens` -- `num_input_tokens` +- Load time, ms +- Number of generated tokens +- Number of tokens in the input prompt -Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `ov.genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. +Performance metrics are stored either in the `DecodedResults` or `EncodedResults` `perf_metric` field. Additionally to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `openvino_genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. 
```python import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) -perf_metrics = res.perf_metrics -print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') -print(f'ttft: {perf_metrics.mean_ttft:.2f}') -print(f'tpot: {perf_metrics.mean_tpot:.2f}') +result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +perf_metrics = result.perf_metrics + +print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') +print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') +print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +``` + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result.perf_metrics; + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; +} ``` output: ```sh @@ -229,9 +249,28 @@ mean_ttft: 42.58 mean_tpot 3.80 ``` ->**Note**: If the input prompt is just a string, the generate function will return only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. +>**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. 
-Several `perf_metrics` can be added with each other. In that case `raw_metrics` will be concatenated and mean/std values will be recalculated. This enhances benchmarking and accumulating statistics from several calls. +Several `perf_metrics` can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. This accumulates statistics from several `generate()` calls + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; + std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; + std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; +} +``` ```python import openvino_genai as ov_genai @@ -240,9 +279,10 @@ res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) perf_metrics = res_1.perf_metrics + res_2.perf_metrics -print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}') -print(f'ttft: {perf_metrics.mean_ttft:.2f}') -print(f'tpot: {perf_metrics.mean_tpot:.2f}') +print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') +print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') +print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') ``` ## How It Works diff --git a/src/cpp/src/greedy_decoding.cpp 
b/src/cpp/src/greedy_decoding.cpp index 8b0cf19c1f..8dc56b4ba8 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "openvino/genai/perf_metrics.hpp" -// #include "perf_counters.hpp" #include "utils.hpp" namespace ov { diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index bc394fae52..92b6315990 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -63,8 +63,6 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); generate_duration = calc_mean_and_std(raw_metrics.generate_durations); - generate_duration = calc_mean_and_std(raw_metrics.generate_durations); - tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); From 60e71881766334a2dfd05e4b17b22e7de740d2d1 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 14:56:13 +0200 Subject: [PATCH 11/15] uset getter and cache evaluate results --- .../cpp/benchmark_genai/benchmark_genai.cpp | 16 +++--- .../python/benchmark_genai/benchmark_genai.py | 14 ++--- src/README.md | 32 ++++++------ .../include/openvino/genai/perf_metrics.hpp | 16 ++++++ src/cpp/src/perf_metrics.cpp | 52 ++++++++++++++++++- src/python/py_generate_pipeline.cpp | 14 ++--- 6 files changed, 104 insertions(+), 40 deletions(-) diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 2fd5eafc69..287d6b379a 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -50,15 +50,15 @@ int main(int argc, char* argv[]) try { res = pipe.generate(prompt, config); metrics = metrics + res.perf_metrics; } - + std::cout << std::fixed << std::setprecision(2); - std::cout << "Load time: " << metrics.load_time << " ms" << 
std::endl; - std::cout << "Generate time: " << metrics.generate_duration.mean << " ± " << metrics.generate_duration.std << " ms" << std::endl; - std::cout << "Tokenization time: " << metrics.tokenization_duration.mean << " ± " << metrics.tokenization_duration.std << " ms" << std::endl; - std::cout << "Detokenization time: " << metrics.detokenization_duration.mean << " ± " << metrics.detokenization_duration.std << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ± " << metrics.ttft.std << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ± " << metrics.tpot.std << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " ± " << metrics.throughput.std << " tokens/s" << std::endl; + std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; + std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl; + std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl; return 0; } catch (const std::exception& error) { diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index ef468053d8..9851483880 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -37,13 +37,13 @@ def 
main(): res = pipe.generate(prompt, config) perf_metrics += res.perf_metrics - print(f"Load time: {perf_metrics.load_time:.2f} ms") - print(f"Generate time: {perf_metrics.generate_duration.mean:.2f} ± {perf_metrics.generate_duration.std:.2f} ms") - print(f"Tokenization time: {perf_metrics.tokenization_duration.mean:.2f} ± {perf_metrics.tokenization_duration.std:.2f} ms") - print(f"Detokenization time: {perf_metrics.detokenization_duration.mean:.2f} ± {perf_metrics.detokenization_duration.std:.2f} ms") - print(f"TTFT: {perf_metrics.ttft.mean:.2f} ± {perf_metrics.ttft.std:.2f} ms") - print(f"TPOT: {perf_metrics.tpot.mean:.2f} ± {perf_metrics.tpot.std:.2f} ms") - print(f"Throughput : {perf_metrics.throughput.mean:.2f} ± {perf_metrics.throughput.std:.2f} tokens/s") + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") + print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") + print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") + print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms") + print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms") + print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms") + print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s") if __name__ == "__main__": main() diff --git a/src/README.md b/src/README.md index aa4dc0f301..aefa993d8e 100644 --- a/src/README.md +++ b/src/README.md @@ -219,10 +219,10 @@ pipe = ov_genai.LLMPipeline(model_path, "CPU") result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) perf_metrics = result.perf_metrics -print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') -print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') 
-print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') -print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` ```cpp @@ -236,10 +236,10 @@ int main(int argc, char* argv[]) { auto perf_metrics = result.perf_metrics; std::cout << std::fixed << std::setprecision(2); - std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; } ``` output: @@ -265,10 +265,10 @@ int main(int argc, char* argv[]) { auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics std::cout << std::fixed << std::setprecision(2); - std::cout << "Generate duration: " << perf_metrics.generate_duration.mean << " ms" << std::endl; - std::cout << "TTFT: " << metrics.ttft.mean << " ms" << std::endl; - std::cout << "TPOT: " << metrics.tpot.mean << " ms/token " << std::endl; - std::cout << "Throughput: " << metrics.throughput.mean << " tokens/s" << std::endl; + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " <<
std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; } ``` @@ -279,10 +279,10 @@ res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) perf_metrics = res_1.perf_metrics + res_2.perf_metrics -print(f'Generate duration: {perf_metrics.generate_duration.mean:.2f}') -print(f'TTFT: {perf_metrics.ttft.mean:.2f} ms') -print(f'TPOT: {perf_metrics.tpot.mean:.2f} ms/token') -print(f'Throughput: {perf_metrics.throughput.mean:.2f} tokens/s') +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` ## How It Works diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp index 8715761792..ddb9ff581f 100644 --- a/src/cpp/include/openvino/genai/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -57,6 +57,22 @@ struct OPENVINO_GENAI_EXPORTS PerfMetrics { size_t num_generated_tokens; size_t num_input_tokens; + + float get_load_time(); // Load time in ms. + float get_num_generated_tokens(); + float get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. + + MeanStdPair get_generate_duration(); + MeanStdPair get_tokenization_duration(); + MeanStdPair get_detokenization_duration(); + + // Flag indicating if raw metrics were evaluated. + // If false, the current mean/std values of ttft, tpot, etc. are not up to date + // and evaluate_statistics() should recalculate them. + bool m_evaluated = false; /** * @brief calculates mean/std values from raw_metrics. 
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index 92b6315990..2f378ab302 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -32,11 +32,58 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector(duration).count(); } - + void PerfMetrics::evaluate_statistics(std::optional start_time) { + if (m_evaluated){ + return; + } // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. if (start_time.has_value()) { auto start_time_val = *start_time; @@ -68,6 +115,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { // tokens per second throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)}; + m_evaluated = true; } PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { @@ -103,7 +151,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; res.num_input_tokens = num_generated_tokens + right.num_input_tokens; res.load_time = load_time; - res.evaluate_statistics(); + res.m_evaluated = false; return res; } diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index ed687d6f40..9bee185ff7 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -606,13 +606,13 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "PerfMetrics") .def(py::init<>()) - .def_readonly("generate_duration", &PerfMetrics::generate_duration) - .def_readonly("tokenization_duration", &PerfMetrics::tokenization_duration) - .def_readonly("detokenization_duration", &PerfMetrics::detokenization_duration) - .def_readonly("throughput", &PerfMetrics::throughput) - .def_readonly("tpot", &PerfMetrics::tpot) - .def_readonly("ttft", &PerfMetrics::ttft) - .def_readonly("load_time", &PerfMetrics::load_time) + .def("get_generate_duration", 
&PerfMetrics::get_generate_duration) + .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) + .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) + .def("get_throughput", &PerfMetrics::get_throughput) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_load_time", &PerfMetrics::get_load_time) .def("__add__", &PerfMetrics::operator+) .def("__iadd__", &PerfMetrics::operator+=) .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); From e553ef5dd78ea6bb11cc32bdfb6fb397cba55a24 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 26 Jul 2024 15:11:41 +0200 Subject: [PATCH 12/15] update Readme's --- samples/cpp/benchmark_genai/README.md | 4 ++-- samples/python/benchmark_genai/README.md | 4 ++-- src/README.md | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md index bac16c2f7d..616bb6a36d 100644 --- a/samples/cpp/benchmark_genai/README.md +++ b/samples/cpp/benchmark_genai/README.md @@ -1,6 +1,6 @@ -# Benchmarking Vanilla GenAI +# LLMs benchmarking sample -This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. ## Download and convert the model and tokenizers diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md index fa4fa85576..1ff9ef4305 100644 --- a/samples/python/benchmark_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -1,6 +1,6 @@ -# Benchmarking Vanilla GenAI +# LLMs benchmarking sample -This sample script demonstrates how to benchmark an LLMModel in OpenVINO GenAI. 
The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. ## Download and convert the model and tokenizers diff --git a/src/README.md b/src/README.md index aefa993d8e..e88c2f784f 100644 --- a/src/README.md +++ b/src/README.md @@ -285,6 +285,8 @@ print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') ``` +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/cpp/benchmark_genai/README.md) samples. + ## How It Works For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). 
From 3bfbab55b3d862c9f360a3ba1a58536a328b28fc Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Fri, 26 Jul 2024 15:10:11 +0100 Subject: [PATCH 13/15] StaticLLMPipeline dangling models hotfix (#693) --- src/cpp/src/llm_pipeline_static.cpp | 18 +++++++++--------- src/cpp/src/llm_pipeline_static.hpp | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 3f50d30ec9..351e10b523 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -144,26 +144,26 @@ StaticLLMPipeline::StaticLLMPipeline( */ ov::Core core; // (1) Read the template model - this will be kvcache model - auto kvcache_model = core.read_model(path / "openvino_model.xml"); + m_kvcache_model = core.read_model(path / "openvino_model.xml"); // (2) Expose KV-cache input and output layers from kvcache model - ov::pass::StatefulToStateless().run_on_model(kvcache_model); + ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); // (3) Clone the model - this will be prefill - auto prefill_model = kvcache_model->clone(); - prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); + m_prefill_model = m_kvcache_model->clone(); + m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); // (4) Reshape both models to static shape m_kvcache_desc = KVCacheDesc { 1024u, 0u }; const uint32_t max_prompt_size = m_kvcache_desc.total_size; const uint32_t max_kvcache_size = m_kvcache_desc.total_size; - reshape_to_static(prefill_model, max_prompt_size, max_kvcache_size); - reshape_to_static(kvcache_model, 1u, max_kvcache_size); + reshape_to_static(m_prefill_model, max_prompt_size, max_kvcache_size); + reshape_to_static(m_kvcache_model, 1u, max_kvcache_size); // (5) Add slices to kvcache model - kvcache_model = add_slices_to_kvcache_inputs(kvcache_model); + m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); // (6) 
Compile both model m_prefill_request = core.compile_model( - prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") + m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") ).create_infer_request(); m_kvcache_request = core.compile_model( - kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") + m_kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") ).create_infer_request(); // (7) Initialize tensors prepare_for_new_conversation(); diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 85488e1880..7560b7e336 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -46,6 +46,10 @@ class StaticLLMPipeline final : public LLMPipelineImplBase { uint32_t num_stored_tokens; }; + // FIXME: Ideally, we don't need to keep those + std::shared_ptr m_kvcache_model; + std::shared_ptr m_prefill_model; + KVCacheDesc m_kvcache_desc; ov::InferRequest m_kvcache_request; ov::InferRequest m_prefill_request; From 06c57b70de3093830c7a475ed61ad9a5bbf3cb87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Mon, 29 Jul 2024 15:56:27 +0200 Subject: [PATCH 14/15] Remove Dockerfile (#700) Removing dockerfile from release branch due to process requirements. 
--- Dockerfile | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index b73d907b87..0000000000 --- a/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM ubuntu:22.04 - -ARG JOBS -WORKDIR /workspace -RUN apt-get update -y && apt-get install -y python3-pip python3-venv git - -# Install OpenVINO -RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \ - cd /workspace/openvino && \ - git submodule update --init -- /workspace/openvino/thirdparty/xbyak /workspace/openvino/thirdparty/pugixml /workspace/openvino/thirdparty/open_model_zoo \ - /workspace/openvino/thirdparty/protobuf /workspace/openvino/thirdparty/snappy /workspace/openvino/thirdparty/telemetry /workspace/openvino/src/plugins/intel_cpu/thirdparty/mlas \ - /workspace/openvino/src/plugins/intel_cpu/thirdparty/onednn /workspace/openvino/src/bindings/python/thirdparty/pybind11 && cd - - -RUN /workspace/openvino/install_build_dependencies.sh -RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt -RUN cmake -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_WHEEL=ON -DENABLE_CPPLINT=OFF -DENABLE_SAMPLES=OFF -DENABLE_INTEL_GPU=OFF \ - -DENABLE_INTEL_NPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF -DENABLE_OV_TF_FRONTEND=ON -DENABLE_OV_ONNX_FRONTEND=OFF \ - -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -S /workspace/openvino -B /workspace/openvino_build -RUN cmake --build /workspace/openvino_build --parallel $JOBS -RUN cmake -P /workspace/openvino_build/cmake_install.cmake -RUN python3 -m pip install /workspace/openvino_build/wheels/openvino-2024* -ENV OpenVINO_DIR=/workspace/openvino_build - -# Download dataset -RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# Build GenAI 
library with dependencies -RUN git clone https://github.com/Wovchena/openvino.genai-public.git -b reuse-Tokenizer openvino.genai && \ - cd /workspace/openvino.genai/thirdparty && git submodule update --remote --init && \ - mkdir /workspace/openvino.genai/build && cd /workspace/openvino.genai/build && \ - cmake -DCMAKE_BUILD_TYPE=Release .. && \ - make -j${JOBS} - -# Install test dependencies -RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/tests/python_tests/continuous_batching/requirements.txt -ENV PYTHONPATH=/workspace/openvino.genai/build/ -ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/ From e2864696954f8b9e73cee7704dc231e7fed07b10 Mon Sep 17 00:00:00 2001 From: Anatoliy Talamanov Date: Tue, 30 Jul 2024 12:26:04 +0100 Subject: [PATCH 15/15] StaticLLMPipeline - align u4 zero points (#705) --- src/cpp/src/llm_pipeline_static.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 351e10b523..c4ff0a90ab 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -12,6 +12,23 @@ namespace { +void align_u4_zp_constants(const std::shared_ptr& model) { + for (auto op : model->get_ops()) { + if (ov::op::util::is_constant(op)) { + auto cst_op = std::dynamic_pointer_cast(op); + const auto cst_op_out = cst_op->output(0); + if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) { + ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape()); + *static_cast(cst_tensor.data()) = cst_op->get_vector()[0] & 0x0f; + auto new_cst_op = std::make_shared(cst_tensor); + for (auto target_input : cst_op_out.get_target_inputs()) { + 
target_input.replace_source_output(new_cst_op); + } + } + } + } +} + std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { const auto kvcache_name_pattern = "past_key_values"; std::vector> new_params; @@ -147,6 +164,7 @@ StaticLLMPipeline::StaticLLMPipeline( m_kvcache_model = core.read_model(path / "openvino_model.xml"); // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + align_u4_zp_constants(m_kvcache_model); // (3) Clone the model - this will be prefill m_prefill_model = m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill");