add more precise durations
pavel-esir committed Jul 22, 2024
1 parent bb1113c commit 0a8f0d9
Showing 11 changed files with 52 additions and 53 deletions.
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(5)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("mt,max_new_tokens", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
("h,help", "Print usage");
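Note: raising the default iteration count from 5 to 20 gives the mean/std statistics more samples to work with. For context, a minimal sketch of how such cxxopts defaults are typically consumed in a benchmark loop (the loop itself is outside this hunk, so its exact shape here is an assumption):

```cpp
#include <cxxopts.hpp>
#include <cstddef>
#include <string>

int main(int argc, char* argv[]) {
    cxxopts::Options options("benchmark_sample", "Sketch of the sample's option handling");
    options.add_options()
        ("nw,num_warmup", "Number of warmup iterations",
         cxxopts::value<size_t>()->default_value(std::to_string(1)))
        ("n,num_iter", "Number of iterations",
         cxxopts::value<size_t>()->default_value(std::to_string(20)));
    auto result = options.parse(argc, argv);

    size_t num_warmup = result["num_warmup"].as<size_t>();
    size_t num_iter = result["num_iter"].as<size_t>();

    // Warmup runs prime caches and are excluded from statistics; only the
    // measured iterations feed the mean/std values the benchmark reports.
    for (size_t i = 0; i < num_warmup; ++i) { /* pipe.generate(prompt, config); */ }
    for (size_t i = 0; i < num_iter; ++i)   { /* metrics += pipe.generate(prompt, config).metrics; */ }
    return 0;
}
```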
15 changes: 8 additions & 7 deletions samples/python/benchmark_vanilla_genai/README.md
@@ -56,11 +56,12 @@ python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/
```

```
- Load time: 3446 ms
- Generate time: 876.2 ± 3.30719 ms
- Tokenization time: 0 ± 0 ms
- Detokenization time: 0 ± 0 ms
- ttft: 168 ± 0 ms
- tpot: 174.68 ± 4.08671 ms
- Tokens/s: 5.72475 ± 0.133933
+ Load time: 3405.69 ms
+ Generate time: 1430.77 ± 3.04 ms
+ Tokenization time: 0.51 ± 0.02 ms
+ Detokenization time: 0.37 ± 0.01 ms
+ TTFT: 81.60 ± 0.54 ms
+ TPOT: 71.52 ± 2.72 ms
+ Throughput tokens/s: 13.98 ± 0.53
```
@@ -10,7 +10,7 @@ def main():
parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

@@ -22,9 +22,8 @@ def main():
num_warmup = args.num_warmup
num_iter = args.num_iter


config = ov_genai.GenerationConfig()
- config.max_new_tokens = args.num_new_tokens
+ config.max_new_tokens = args.max_new_tokens

pipe = ov_genai.LLMPipeline(model_path, device)

@@ -37,8 +36,8 @@ def main():
# pdb.set_trace()
res = pipe.generate(prompt, config)
metrics += res.metrics

print(f"Load time: {metrics.load_time} ms")
print(f"Load time: {metrics.load_time:.2f} ms")
print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms")
print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms")
11 changes: 6 additions & 5 deletions src/cpp/include/openvino/genai/perf_metrics.hpp
@@ -13,19 +13,20 @@ namespace ov {
namespace genai {

using TimePoint = std::chrono::steady_clock::time_point;
+ using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

/**
* @brief Structure with raw performance metrics for each generation, before any statistics are calculated.
*/
struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
- std::vector<float> generate_durations;
- std::vector<float> tokenization_durations;
- std::vector<float> detokenization_durations;
+ std::vector<MicroSeconds> generate_durations;
+ std::vector<MicroSeconds> tokenization_durations;
+ std::vector<MicroSeconds> detokenization_durations;

- std::vector<float> m_times_to_first_token;
+ std::vector<MicroSeconds> m_times_to_first_token;
std::vector<TimePoint> m_new_token_times;
std::vector<size_t> m_batch_sizes;
- std::vector<float> m_durations;
+ std::vector<MicroSeconds> m_durations;

size_t num_generated_tokens;
size_t num_input_tokens;
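Note: switching the raw counters from plain float milliseconds to a std::chrono::duration with a float representation is what enables the finer precision: steady_clock differences convert to such a duration implicitly, with no duration_cast and no truncation to whole milliseconds. A standalone sketch of that conversion behavior, reusing the alias above:

```cpp
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

// Same alias as in perf_metrics.hpp.
using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

int main() {
    auto start = std::chrono::steady_clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(5));
    auto stop = std::chrono::steady_clock::now();

    // The integer-tick steady_clock duration converts implicitly to the
    // float-based MicroSeconds; sub-millisecond detail is preserved.
    std::vector<MicroSeconds> durations;
    durations.emplace_back(stop - start);
    std::cout << durations.back().count() << " us\n";
    return 0;
}
```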
1 change: 1 addition & 0 deletions src/cpp/src/greedy_decoding.cpp
@@ -22,6 +22,7 @@ EncodedResults greedy_decoding(
size_t prompt_len = prompts_shape[1];
size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len);

+ // Initialize results and performance metrics.
EncodedResults results;
auto& raw_perf_counters = results.metrics.raw_counters;

6 changes: 4 additions & 2 deletions src/cpp/src/llm_pipeline.cpp
@@ -165,7 +165,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

auto& raw_counters = decoded_results.metrics.raw_counters;
auto stop_time = std::chrono::steady_clock::now();

+ raw_counters.generate_durations = std::vector<MicroSeconds>();
raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time));
raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time));
raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time));
@@ -269,11 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
} else {
m_is_cache_empty = false;
}
+ auto stop_time = std::chrono::steady_clock::now();

// If called without tokenization, this stat will not be reported.
auto& metrics = result.metrics;
metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
metrics.load_time = this->m_load_time_ms;
+ metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time));
metrics.evaluate_statistics(start_time);
return result;
}
@@ -391,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline(
m_pimpl = make_unique<StatefulLLMPipeline>(std::filesystem::path(path), device, config);
}
auto stop_time = std::chrono::steady_clock::now();
- m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time);
+ m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f;
}

ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
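Note: get_duration_ms now returns a microsecond count (see perf_metrics.cpp below), so the constructor divides by 1000.0f to keep m_load_time_ms in milliseconds. A minimal sketch of that pattern, with a stand-in loader function:

```cpp
#include <chrono>
#include <iostream>

// Stand-in for the work done inside the LLMPipeline constructor.
static void load_model() { /* ... */ }

// Mirrors PerfMetrics::get_duration_ms after this commit: microseconds out.
static float duration_to_us(std::chrono::steady_clock::duration d) {
    return std::chrono::duration_cast<std::chrono::microseconds>(d).count();
}

int main() {
    auto start = std::chrono::steady_clock::now();
    load_model();
    auto stop = std::chrono::steady_clock::now();

    // Divide by 1000 to report milliseconds, as llm_pipeline.cpp now does.
    float load_time_ms = duration_to_us(stop - start) / 1000.0f;
    std::cout << "Load time: " << load_time_ms << " ms\n";
    return 0;
}
```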
8 changes: 7 additions & 1 deletion src/cpp/src/multinomial_decoding.cpp
@@ -162,7 +162,9 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner

size_t prompt_len = prompts_shape[1];

- ov::genai::EncodedResults results;
+ // Initialize results and performance metrics.
+ EncodedResults results;
+ auto& raw_perf_counters = results.metrics.raw_counters;
results.scores.resize(batch_size, 0);
results.tokens.resize(batch_size);

@@ -179,6 +181,8 @@
m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;

m_model_runner.infer();
+ raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+ raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

auto logits_tensor = m_model_runner.get_tensor("logits");

@@ -222,6 +226,8 @@
m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;

m_model_runner.infer();
+ raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+ raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

logits = m_model_runner.get_tensor("logits").data<float>();
out_token = sampling.get_out_token(logits, vocab_size, tokens);
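Note: both decoding loops now share the same bookkeeping pattern: record a timestamp and the batch size immediately after every infer() call, and let evaluate_statistics derive TTFT/TPOT later. A minimal sketch of the pattern with stand-in types:

```cpp
#include <chrono>
#include <cstddef>
#include <vector>

using TimePoint = std::chrono::steady_clock::time_point;

// Stand-in for RawPerfMetrics' token-timing fields.
struct RawCounters {
    std::vector<TimePoint> m_new_token_times;
    std::vector<size_t> m_batch_sizes;
};

static void infer() { /* stand-in for m_model_runner.infer() */ }

int main() {
    RawCounters counters;
    const size_t batch_size = 1;
    for (int step = 0; step < 20; ++step) {
        infer();
        // One timestamp per generation step; statistics are computed later.
        counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
        counters.m_batch_sizes.emplace_back(batch_size);
    }
    return 0;
}
```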
24 changes: 15 additions & 9 deletions src/cpp/src/perf_metrics.cpp
@@ -9,12 +9,18 @@

namespace {

- std::pair<float, float> calc_mean_and_std(const std::vector<float>& durations) {
-     float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size();
+ // std::pair<float, float> calc_mean_and_std(const std::vector<float>& durations) {
+ std::pair<float, float> calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
+     float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
+         [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+             return acc + duration.count();
+         });
+     mean /= durations.size();
+     mean /= 1000.f;

float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f,
-     [](const float& acc, const float& duration) -> float {
-         return acc + duration * duration;
+     [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+         return acc + duration.count() * duration.count() / 1000000.0f;
});
float std = std::sqrt(sum_square_durations / durations.size() - mean * mean);
return {mean, std};
@@ -27,7 +33,7 @@ namespace ov {
namespace genai {

float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) {
- return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+ return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
}

void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
@@ -36,14 +42,14 @@
auto start_time_val = *start_time;
auto& tok_times = raw_counters.m_new_token_times;
auto& batch_sizes = raw_counters.m_batch_sizes;
- raw_counters.m_durations = std::vector<float>(tok_times.size());
+ raw_counters.m_durations = std::vector<MicroSeconds>(tok_times.size());

- auto ttft = std::chrono::duration_cast<std::chrono::milliseconds>(tok_times[0] - start_time_val).count();
- raw_counters.m_times_to_first_token = std::vector<float>();
+ auto ttft = tok_times[0] - start_time_val;
+ raw_counters.m_times_to_first_token = std::vector<MicroSeconds>();
raw_counters.m_times_to_first_token.emplace_back(ttft);
num_generated_tokens = 0;
for (size_t i = 0; i < tok_times.size(); ++i) {
- raw_counters.m_durations[i] = std::chrono::duration_cast<std::chrono::milliseconds>(tok_times[i] - start_time_val).count();
+ raw_counters.m_durations[i] = tok_times[i] - start_time_val;

// If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 ms / 5.
// TODO: double-check that it's valid for batch > 1.
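Note: two unit conventions meet in this file: the raw counters hold microseconds, while the reported statistics are in milliseconds, hence the /1000.f and /1000000.0f (µs² → ms²) factors above, and the reason llm_pipeline.cpp divides get_duration_ms (which, despite its name, now returns a microsecond count) by 1000.0f. A small self-check of the same math on known values:

```cpp
#include <cassert>
#include <chrono>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;

// Same math as calc_mean_and_std above: microseconds in, milliseconds out.
std::pair<float, float> calc_mean_and_std(const std::vector<MicroSeconds>& durations) {
    float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
        [](float acc, const MicroSeconds& d) { return acc + d.count(); });
    mean /= durations.size();
    mean /= 1000.f;  // us -> ms

    // us^2 / 1e6 = ms^2, so the variance below is already in ms^2.
    float sum_sq = std::accumulate(durations.begin(), durations.end(), 0.0f,
        [](float acc, const MicroSeconds& d) { return acc + d.count() * d.count() / 1000000.0f; });
    float stdev = std::sqrt(sum_sq / durations.size() - mean * mean);
    return {mean, stdev};
}

int main() {
    // Samples of 1 ms and 3 ms: mean should be 2 ms, std 1 ms.
    std::vector<MicroSeconds> ds{MicroSeconds(1000.f), MicroSeconds(3000.f)};
    auto [mean, stdev] = calc_mean_and_std(ds);
    assert(std::abs(mean - 2.0f) < 1e-3f);
    assert(std::abs(stdev - 1.0f) < 1e-3f);
    return 0;
}
```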
9 changes: 2 additions & 7 deletions src/cpp/src/sampler.hpp
@@ -219,13 +219,8 @@ class Sampler {
}

Token _greedy_sample(const std::vector<Token>& logit_vector) const {
- Token max_token{-std::numeric_limits<float>::infinity() , 0};
- for (const auto& logit : logit_vector) {
-     if (logit.m_log_prob > max_token.m_log_prob) {
-         max_token = logit;
-     }
- }
- return max_token;
+ auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; });
+ return *out_token;
}

std::vector<Token> _multinomial_sample(const std::vector<Token>& logit_vector, size_t num_tokens_per_sequence) {
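Note: the rewrite replaces the hand-rolled argmax loop with the idiomatic std::max_element pattern; behavior is unchanged. A standalone sketch with a stand-in Token type (the real one lives in the GenAI sources):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Stand-in for the project's Token type.
struct Token {
    float m_log_prob;
    int64_t m_index;
};

Token greedy_sample(const std::vector<Token>& logit_vector) {
    // std::max_element scans once and returns an iterator to the token with
    // the highest log-probability; the lambda defines the ordering.
    auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(),
        [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; });
    return *out_token;
}

int main() {
    std::vector<Token> logits{{-1.5f, 0}, {-0.2f, 1}, {-3.0f, 2}};
    return greedy_sample(logits).m_index == 1 ? 0 : 1;
}
```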
14 changes: 0 additions & 14 deletions src/cpp/src/utils.hpp
@@ -12,20 +12,6 @@ namespace ov {
namespace genai {
namespace utils {

- #include <iostream>
- #include <chrono>
- #include <functional>
-
- // Templated function to measure execution time of an object method.
- template<typename T, typename Ret, typename... Args>
- std::pair<Ret, float> execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... args) {
-     auto start = std::chrono::steady_clock::now();
-     Ret result = (instance.*method)(std::forward<Args>(args)...);
-     auto end = std::chrono::steady_clock::now();
-     auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-     return {result, duration};
- }
-
Tensor init_attention_mask(const Tensor& position_ids);

void print_tensor(const ov::Tensor& tensor);
6 changes: 4 additions & 2 deletions src/python/py_generate_pipeline.cpp
@@ -537,7 +537,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
.def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); })
.def_readonly("scores", &DecodedResults::scores)
.def_readonly("metrics", &DecodedResults::metrics)
.def("__str__", &DecodedResults::operator std::string);;
.def("__str__", &DecodedResults::operator std::string);

py::class_<RawPerfMetrics>(m, "RawPerfMetrics")
.def(py::init<>())
@@ -566,7 +566,9 @@
.def_readonly("std_ttft", &PerfMetrics::std_ttft)
.def_readonly("load_time", &PerfMetrics::load_time)
.def("__add__", &PerfMetrics::operator+)
.def("__iadd__", &PerfMetrics::operator+=);
.def("__iadd__", &PerfMetrics::operator+=)
.def_readonly("raw_counters", &PerfMetrics::raw_counters)
;

py::class_<TokenizedInputs>(m, "TokenizedInputs")
.def(py::init<ov::Tensor, ov::Tensor>())
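Note: exposing raw_counters follows the standard pybind11 read-only attribute pattern, which lets Python inspect per-token timings after a generate() call. A self-contained sketch with stand-in structs and a hypothetical module name (not the real headers):

```cpp
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <cstddef>
#include <vector>

namespace py = pybind11;

// Stand-in structs mirroring the shape (not the full contents) of the real
// RawPerfMetrics / PerfMetrics types.
struct RawPerfMetrics { std::vector<size_t> m_batch_sizes; };
struct PerfMetrics { float load_time = 0.0f; RawPerfMetrics raw_counters; };

PYBIND11_MODULE(perf_sketch, m) {
    py::class_<RawPerfMetrics>(m, "RawPerfMetrics")
        .def(py::init<>())
        .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes);

    py::class_<PerfMetrics>(m, "PerfMetrics")
        .def(py::init<>())
        .def_readonly("load_time", &PerfMetrics::load_time)
        // The new binding above does exactly this: expose the nested raw
        // counters as a read-only attribute.
        .def_readonly("raw_counters", &PerfMetrics::raw_counters);
}
```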
