Skip to content

Commit

Permalink
add detokenization metric; refactor: split into perf_counter & perf_metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Jul 22, 2024
1 parent f0e4190 commit 7cab496
Show file tree
Hide file tree
Showing 15 changed files with 282 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,25 @@ int main(int argc, char* argv[]) try {

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.num_beam_groups = 3;
config.num_beams = 15;

ov::genai::LLMPipeline pipe(model_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

ov::genai::GenerationMetrics metrics;
ov::genai::PerfMetrics metrics;
for (size_t i = 0; i < num_iter; i++) {
ov::genai::DecodedResults res = pipe.generate(prompt, config);
metrics = metrics + res.metrics;
metrics.load_time = res.metrics.load_time;
}

std::cout << "Load time: " << metrics.load_time << " ms" << std::endl;
std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl;
std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl;
std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl;
std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl;

return 0;
} catch (const std::exception& error) {
Expand Down
40 changes: 0 additions & 40 deletions src/cpp/include/openvino/genai/generation_metrics.hpp

This file was deleted.

6 changes: 3 additions & 3 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/generation_metrics.hpp"
#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
Expand All @@ -36,7 +36,7 @@ class EncodedResults {
public:
std::vector<std::vector<int64_t>> tokens;
std::vector<float> scores;
GenerationMetrics metrics;
PerfMetrics metrics;
};

/**
Expand All @@ -50,7 +50,7 @@ class DecodedResults {
public:
std::vector<std::string> texts;
std::vector<float> scores;
GenerationMetrics metrics;
PerfMetrics metrics;

// @brief Convert DecodedResults to a string.
operator std::string() const {
Expand Down
50 changes: 50 additions & 0 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <chrono>
#include "openvino/genai/visibility.hpp"
#include <vector>
#include <memory>

namespace ov {
namespace genai {

using TimePoint = std::chrono::steady_clock::time_point;

struct PerfCounters;

// Aggregated generation performance statistics for an LLM pipeline run.
// All duration fields are in milliseconds (callers print them with an
// " ms" suffix); throughput is tokens per second.
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
// Time to first token (TTFT): mean and standard deviation across runs.
float mean_ttft;
float std_ttft;

// Time per output token (TPOT): mean and standard deviation.
float mean_tpot;
float std_tpot;

// Time spent loading the pipeline/model.
float load_time;
// Reference start timestamp for the measured generation.
// NOTE(review): shadowed by the evaluate() parameter of the same name —
// presumably set from it; confirm in the .cpp implementation.
float start_time;

// Mean wall-clock durations of the overall generate() call and of the
// decoding (detokenization) / encoding (tokenization) stages.
float mean_generate_duration;
float mean_decoding_duration;
float mean_encoding_duration;

// Generation throughput (tokens/s): mean and standard deviation.
float mean_throughput;
float std_throughput;

// Token counts for the measured generation.
size_t num_generated_tokens;
size_t num_input_tokens;

// Raw per-token timestamps collected during generation (see
// PerfCounters::add_timestamp in greedy_decoding); shared so the
// counters survive copies of this struct.
std::shared_ptr<PerfCounters> m_counters;
// Reduces m_counters into the mean/std fields above, measuring relative
// to start_time — TODO confirm exact semantics against the definition.
void evaluate(TimePoint start_time);

// Combine statistics from two runs; used by benchmarks to accumulate
// metrics over iterations (metrics = metrics + res.metrics).
PerfMetrics operator+(const PerfMetrics& metrics) const;
PerfMetrics& operator+=(const PerfMetrics& right);


};

} // namespace genai
} // namespace ov
62 changes: 0 additions & 62 deletions src/cpp/src/generation_metrics.cpp

This file was deleted.

19 changes: 8 additions & 11 deletions src/cpp/src/greedy_decoding.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"
#include "perf_counters.hpp"
#include "utils.hpp"

namespace ov {
Expand All @@ -22,11 +23,8 @@ EncodedResults greedy_decoding(
size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len);

EncodedResults results;
// Time before the first token generated as a reference point.
ov::genai::TimePoints tok_times;
tok_times.reserve(max_new_tokens);
tok_times.emplace_back(std::chrono::steady_clock::now());

auto& perf_counters = results.metrics.m_counters;

results.scores.resize(running_batch_size);
results.tokens.resize(running_batch_size);
std::fill(results.scores.begin(), results.scores.end(), 0);
Expand Down Expand Up @@ -56,8 +54,8 @@ EncodedResults greedy_decoding(
eos_met[batch] = (out_token == generation_config.eos_token_id);
m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
}
tok_times.emplace_back(std::chrono::steady_clock::now());

perf_counters->add_timestamp(running_batch_size);
if (streamer && streamer->put(token_iter_results[0])) {
return results;
}
Expand Down Expand Up @@ -88,7 +86,7 @@ EncodedResults greedy_decoding(

m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
}
tok_times.emplace_back(std::chrono::steady_clock::now());
perf_counters->add_timestamp(running_batch_size);

if (streamer && streamer->put(token_iter_results[0]))
return results;
Expand Down Expand Up @@ -116,9 +114,8 @@ EncodedResults greedy_decoding(
streamer->end();
}

results.metrics = GenerationMetrics(tok_times);
return results;
}

} //namespace genai
} //namespace ov
} //namespace ov
19 changes: 14 additions & 5 deletions src/cpp/src/group_beam_searcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,20 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
std::optional<int32_t> selected_beam_idx) {
OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0,
"number of beams should be divisible by number of groups");

// Initialize beam search

auto batch_size = input_ids.get_shape().at(0);
auto sequence_length = input_ids.get_shape().at(1);

// Initialize time metric counters.
// ov::genai::TimePoints tok_times;
// tok_times.reserve(config.get_max_new_tokens(sequence_length));
// tok_times.emplace_back(std::chrono::steady_clock::now());

// Initialize beam search.
const int64_t* prompt_data = input_ids.data<const int64_t>();
std::vector<std::vector<int64_t>> prompts;
prompts.reserve(batch_size);
for (size_t batch = 0; batch < batch_size; batch++) {
size_t sequence_length = input_ids.get_shape().at(1);
size_t batch_offset = batch * sequence_length;
const int64_t* prompt_start = prompt_data + batch_offset;
prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length});
Expand All @@ -389,7 +395,7 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
lm.set_tensor("beam_idx", beam_idx);

Parameters parameters{std::move(prompts)};
parameters.max_new_tokens = config.max_new_tokens;
parameters.max_new_tokens = config.get_max_new_tokens(sequence_length);
parameters.eos_token_id = config.eos_token_id;
parameters.n_groups = config.num_beam_groups;
parameters.group_size = config.num_beams / config.num_beam_groups;
Expand All @@ -406,6 +412,8 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
lm.infer();

std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
// tok_times.emplace_back(std::chrono::steady_clock::now());

if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) {
// Break the cycle before masks are extended in update_attention_mask_with_beams.
// If generation is continued, attention_mask length should be equal to KV cache size.
Expand Down Expand Up @@ -462,7 +470,8 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
results.tokens.push_back(std::move(beam->get().tokens));
}
}


// results.metrics = PerfCounters(tok_times);
return {results, res_selected_beam_idx};
}

Expand Down
Loading

0 comments on commit 7cab496

Please sign in to comment.