Skip to content

Commit

Permalink
add detokenization metric; refactor: split into perf_counter & perf_metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
pavel-esir committed Jul 22, 2024
1 parent f0e4190 commit 7cab496
Show file tree
Hide file tree
Showing 15 changed files with 282 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,25 @@ int main(int argc, char* argv[]) try {

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.num_beam_groups = 3;
config.num_beams = 15;

ov::genai::LLMPipeline pipe(model_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

ov::genai::GenerationMetrics metrics;
ov::genai::PerfMetrics metrics;
for (size_t i = 0; i < num_iter; i++) {
ov::genai::DecodedResults res = pipe.generate(prompt, config);
metrics = metrics + res.metrics;
metrics.load_time = res.metrics.load_time;
}

std::cout << "Load time: " << metrics.load_time << " ms" << std::endl;
std::cout << "ttft: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms" << std::endl;
std::cout << "tpot: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms" << std::endl;
std::cout << "Tokens/s: " << metrics.get_tokens_per_sec().first << std::endl;
std::cout << "Tokens/s: " << metrics.mean_throughput << std::endl;

return 0;
} catch (const std::exception& error) {
Expand Down
40 changes: 0 additions & 40 deletions src/cpp/include/openvino/genai/generation_metrics.hpp

This file was deleted.

6 changes: 3 additions & 3 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/generation_metrics.hpp"
#include "openvino/genai/perf_metrics.hpp"

namespace ov {
namespace genai {
Expand All @@ -36,7 +36,7 @@ class EncodedResults {
public:
std::vector<std::vector<int64_t>> tokens;
std::vector<float> scores;
GenerationMetrics metrics;
PerfMetrics metrics;
};

/**
Expand All @@ -50,7 +50,7 @@ class DecodedResults {
public:
std::vector<std::string> texts;
std::vector<float> scores;
GenerationMetrics metrics;
PerfMetrics metrics;

// @brief Convert DecodedResults to a string.
operator std::string() const {
Expand Down
50 changes: 50 additions & 0 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <chrono>
#include "openvino/genai/visibility.hpp"
#include <vector>
#include <memory>

namespace ov {
namespace genai {

using TimePoint = std::chrono::steady_clock::time_point;

struct PerfCounters;

// Aggregated generation performance statistics for an LLM pipeline run.
// All duration fields are in milliseconds (callers print them with an
// " ms" suffix); throughput is tokens per second.
struct OPENVINO_GENAI_EXPORTS PerfMetrics {
// Time to first token (TTFT): mean and standard deviation across runs.
float mean_ttft;
float std_ttft;

// Time per output token (TPOT): mean and standard deviation.
float mean_tpot;
float std_tpot;

// Time spent loading the pipeline/model.
float load_time;
// Reference start timestamp for the measured generation.
// NOTE(review): shadowed by the evaluate() parameter of the same name —
// presumably set from it; confirm in the .cpp implementation.
float start_time;

// Mean wall-clock durations of the overall generate() call and of the
// decoding (detokenization) / encoding (tokenization) stages.
float mean_generate_duration;
float mean_decoding_duration;
float mean_encoding_duration;

// Generation throughput (tokens/s): mean and standard deviation.
float mean_throughput;
float std_throughput;

// Token counts for the measured generation.
size_t num_generated_tokens;
size_t num_input_tokens;

// Raw per-token timestamps collected during generation (see
// PerfCounters::add_timestamp in greedy_decoding); shared so the
// counters survive copies of this struct.
std::shared_ptr<PerfCounters> m_counters;
// Reduces m_counters into the mean/std fields above, measuring relative
// to start_time — TODO confirm exact semantics against the definition.
void evaluate(TimePoint start_time);

// Combine statistics from two runs; used by benchmarks to accumulate
// metrics over iterations (metrics = metrics + res.metrics).
PerfMetrics operator+(const PerfMetrics& metrics) const;
PerfMetrics& operator+=(const PerfMetrics& right);


};

} // namespace genai
} // namespace ov
62 changes: 0 additions & 62 deletions src/cpp/src/generation_metrics.cpp

This file was deleted.

19 changes: 8 additions & 11 deletions src/cpp/src/greedy_decoding.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/perf_metrics.hpp"
#include "perf_counters.hpp"
#include "utils.hpp"

namespace ov {
Expand All @@ -22,11 +23,8 @@ EncodedResults greedy_decoding(
size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len);

EncodedResults results;
// Time before the first token generated as a reference point.
ov::genai::TimePoints tok_times;
tok_times.reserve(max_new_tokens);
tok_times.emplace_back(std::chrono::steady_clock::now());

auto& perf_counters = results.metrics.m_counters;

results.scores.resize(running_batch_size);
results.tokens.resize(running_batch_size);
std::fill(results.scores.begin(), results.scores.end(), 0);
Expand Down Expand Up @@ -56,8 +54,8 @@ EncodedResults greedy_decoding(
eos_met[batch] = (out_token == generation_config.eos_token_id);
m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
}
tok_times.emplace_back(std::chrono::steady_clock::now());

perf_counters->add_timestamp(running_batch_size);
if (streamer && streamer->put(token_iter_results[0])) {
return results;
}
Expand Down Expand Up @@ -88,7 +86,7 @@ EncodedResults greedy_decoding(

m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token;
}
tok_times.emplace_back(std::chrono::steady_clock::now());
perf_counters->add_timestamp(running_batch_size);

if (streamer && streamer->put(token_iter_results[0]))
return results;
Expand Down Expand Up @@ -116,9 +114,8 @@ EncodedResults greedy_decoding(
streamer->end();
}

results.metrics = GenerationMetrics(tok_times);
return results;
}

} //namespace genai
} //namespace ov
} //namespace ov
19 changes: 14 additions & 5 deletions src/cpp/src/group_beam_searcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,14 +362,20 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
std::optional<int32_t> selected_beam_idx) {
OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0,
"number of beams should be divisible by number of groups");

// Initialize beam search

auto batch_size = input_ids.get_shape().at(0);
auto sequence_length = input_ids.get_shape().at(1);

// Initialize time metric counters.
// ov::genai::TimePoints tok_times;
// tok_times.reserve(config.get_max_new_tokens(sequence_length));
// tok_times.emplace_back(std::chrono::steady_clock::now());

// Initialize beam search.
const int64_t* prompt_data = input_ids.data<const int64_t>();
std::vector<std::vector<int64_t>> prompts;
prompts.reserve(batch_size);
for (size_t batch = 0; batch < batch_size; batch++) {
size_t sequence_length = input_ids.get_shape().at(1);
size_t batch_offset = batch * sequence_length;
const int64_t* prompt_start = prompt_data + batch_offset;
prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length});
Expand All @@ -389,7 +395,7 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
lm.set_tensor("beam_idx", beam_idx);

Parameters parameters{std::move(prompts)};
parameters.max_new_tokens = config.max_new_tokens;
parameters.max_new_tokens = config.get_max_new_tokens(sequence_length);
parameters.eos_token_id = config.eos_token_id;
parameters.n_groups = config.num_beam_groups;
parameters.group_size = config.num_beams / config.num_beam_groups;
Expand All @@ -406,6 +412,8 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
lm.infer();

std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
// tok_times.emplace_back(std::chrono::steady_clock::now());

if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) {
// Break the cycle before masks are extended in update_attention_mask_with_beams.
// If generation is continued, attention_mask length should be equal to KV cache size.
Expand Down Expand Up @@ -462,7 +470,8 @@ std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm,
results.tokens.push_back(std::move(beam->get().tokens));
}
}


// results.metrics = PerfCounters(tok_times);
return {results, res_selected_beam_idx};
}

Expand Down
Loading

0 comments on commit 7cab496

Please sign in to comment.