diff --git a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp
index 6d96d24fc5..a9bc07f641 100644
--- a/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp
+++ b/samples/cpp/benchmark_vanilla_genai/benchmark_vanilla_genai.cpp
@@ -11,7 +11,7 @@ int main(int argc, char* argv[]) try {
         ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
         ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
         ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
-        ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(5)))
+        ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
         ("mt,max_new_tokens", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
         ("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
         ("h,help", "Print usage");
diff --git a/samples/python/benchmark_vanilla_genai/README.md b/samples/python/benchmark_vanilla_genai/README.md
index af66ea545d..13666a7de9 100644
--- a/samples/python/benchmark_vanilla_genai/README.md
+++ b/samples/python/benchmark_vanilla_genai/README.md
@@ -56,11 +56,12 @@ python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/
 ```
 
 ```
-Load time: 3446 ms
-Generate time: 876.2 ± 3.30719 ms
-Tokenization time: 0 ± 0 ms
-Detokenization time: 0 ± 0 ms
-ttft: 168 ± 0 ms
-tpot: 174.68 ± 4.08671 ms
-Tokens/s: 5.72475 ± 0.133933
+Load time: 3405.69 ms
+Generate time: 1430.77 ± 3.04 ms
+Tokenization time: 0.51 ± 0.02 ms
+Detokenization time: 0.37 ± 0.01 ms
+TTFT: 81.60 ± 0.54 ms
+TPOT: 71.52 ± 2.72 ms
+Throughput tokens/s: 13.98 ± 0.53
 ```
+s
\ No newline at end of file
diff --git a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py
index 4c87234179..9e4debe847 100755
--- a/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py
+++ b/samples/python/benchmark_vanilla_genai/benchmark_vanilla_genai.py
@@ -10,7 +10,7 @@ def main():
     parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
     parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
     parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
-    parser.add_argument("-n", "--num_iter", type=int, default=3, help="Number of iterations")
+    parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
     parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
     parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")
 
@@ -22,9 +22,8 @@ def main():
     num_warmup = args.num_warmup
     num_iter = args.num_iter
 
-    config = ov_genai.GenerationConfig()
-    config.max_new_tokens = args.num_new_tokens
+    config.max_new_tokens = args.max_new_tokens
 
     pipe = ov_genai.LLMPipeline(model_path, device)
 
@@ -37,8 +36,8 @@ def main():
         # pdb.set_trace()
         res = pipe.generate(prompt, config)
         metrics += res.metrics
-    
+
-    print(f"Load time: {metrics.load_time} ms")
+    print(f"Load time: {metrics.load_time:.2f} ms")
     print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
     print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms")
     print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms")
diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp
index e66c917e81..5779b9b080 100644
--- a/src/cpp/include/openvino/genai/perf_metrics.hpp
+++ b/src/cpp/include/openvino/genai/perf_metrics.hpp
@@ -13,19 +13,20 @@ namespace ov {
 namespace genai {
 
 using TimePoint = std::chrono::steady_clock::time_point;
+using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>;
 
 /**
  * @brief Structure with raw performance metrics for each generation before any statistics calculated.
  */
 struct OPENVINO_GENAI_EXPORTS RawPerfMetrics {
-    std::vector<float> generate_durations;
-    std::vector<float> tokenization_durations;
-    std::vector<float> detokenization_durations;
+    std::vector<MicroSeconds> generate_durations;
+    std::vector<MicroSeconds> tokenization_durations;
+    std::vector<MicroSeconds> detokenization_durations;
 
-    std::vector<float> m_times_to_first_token;
+    std::vector<MicroSeconds> m_times_to_first_token;
     std::vector<TimePoint> m_new_token_times;
     std::vector<size_t> m_batch_sizes;
-    std::vector<float> m_durations;
+    std::vector<MicroSeconds> m_durations;
 
     size_t num_generated_tokens;
     size_t num_input_tokens;
diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp
index c5bf10a2d1..c8fd36cbdd 100644
--- a/src/cpp/src/greedy_decoding.cpp
+++ b/src/cpp/src/greedy_decoding.cpp
@@ -22,6 +22,7 @@ EncodedResults greedy_decoding(
     size_t prompt_len = prompts_shape[1];
     size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len);
 
+    // Initialize results and performance metrics.
     EncodedResults results;
     auto& raw_perf_counters = results.metrics.raw_counters;
 
diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 5241142afe..adac9110e1 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -165,7 +165,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
 
         auto& raw_counters = decoded_results.metrics.raw_counters;
         auto stop_time = std::chrono::steady_clock::now();
-
+        raw_counters.generate_durations = std::vector<MicroSeconds>();
         raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time));
         raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_duration_ms(encode_stop_time - start_time));
         raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_duration_ms(decode_stop_time - decode_start_time));
@@ -269,11 +269,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         } else {
             m_is_cache_empty = false;
         }
+        auto stop_time = std::chrono::steady_clock::now();
 
         // If is called without tokenization then that stat will not be reported.
         auto& metrics = result.metrics;
         metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1);
         metrics.load_time = this->m_load_time_ms;
+        metrics.raw_counters.generate_durations.emplace_back(PerfMetrics::get_duration_ms(stop_time - start_time));
         metrics.evaluate_statistics(start_time);
         return result;
     }
@@ -391,7 +393,7 @@ ov::genai::LLMPipeline::LLMPipeline(
         m_pimpl = make_unique<StatefulLLMPipeline>(std::filesystem::path(path), device, config);
     }
     auto stop_time = std::chrono::steady_clock::now();
-    m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time);
+    m_pimpl->m_load_time_ms = PerfMetrics::get_duration_ms(stop_time - start_time) / 1000.0f;
 }
 
 ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const {
diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp
index fd16e948c1..fc59f00e12 100644
--- a/src/cpp/src/multinomial_decoding.cpp
+++ b/src/cpp/src/multinomial_decoding.cpp
@@ -162,7 +162,9 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
 
     size_t prompt_len = prompts_shape[1];
 
-    ov::genai::EncodedResults results;
+    // Initialize results and performance metrics.
+    EncodedResults results;
+    auto& raw_perf_counters = results.metrics.raw_counters;
     results.scores.resize(batch_size, 0);
     results.tokens.resize(batch_size);
@@ -179,6 +181,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
     m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;
 
     m_model_runner.infer();
+    raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+    raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
     auto logits_tensor = m_model_runner.get_tensor("logits");
@@ -222,6 +226,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner
         m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;
 
         m_model_runner.infer();
+        raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
+        raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
 
         logits = m_model_runner.get_tensor("logits").data<float>();
         out_token = sampling.get_out_token(logits, vocab_size, tokens);
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 3947793802..d4dc6c8de6 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -9,12 +9,18 @@
 
 namespace {
 
-std::pair<float, float> calc_mean_and_std(const std::vector<float>& durations) {
-    float mean = std::accumulate(durations.begin(), durations.end(), 0.0f) / durations.size();
+// std::pair<float, float> calc_mean_and_std(const std::vector<float>& durations) {
+std::pair<float, float> calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
+    float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
+        [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+            return acc + duration.count();
+        });
+    mean /= durations.size();
+    mean /= 1000.f;
     float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f,
-        [](const float& acc, const float& duration) -> float {
-            return acc + duration * duration;
+        [](const float& acc, const ov::genai::MicroSeconds& duration) -> float {
+            return acc + duration.count() * duration.count() / 1000000.0f;
         });
     float std = std::sqrt(sum_square_durations / durations.size() - mean * mean);
     return {mean, std};
 }
@@ -27,7 +33,7 @@ namespace ov {
 namespace genai {
 
 float PerfMetrics::get_duration_ms(std::chrono::steady_clock::duration duration) {
-    return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+    return std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
 }
 
 void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
@@ -36,14 +42,14 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
         auto start_time_val = *start_time;
         auto& tok_times = raw_counters.m_new_token_times;
         auto& batch_sizes = raw_counters.m_batch_sizes;
-        raw_counters.m_durations = std::vector<float>(tok_times.size());
+        raw_counters.m_durations = std::vector<MicroSeconds>(tok_times.size());
 
-        auto ttft = std::chrono::duration_cast<std::chrono::milliseconds>(tok_times[0] - start_time_val).count();
-        raw_counters.m_times_to_first_token = std::vector<float>();
+        auto ttft = tok_times[0] - start_time_val;
+        raw_counters.m_times_to_first_token = std::vector<MicroSeconds>();
         raw_counters.m_times_to_first_token.emplace_back(ttft);
         num_generated_tokens = 0;
         for (size_t i = 0; i < tok_times.size(); ++i) {
-            raw_counters.m_durations[i] = std::chrono::duration_cast<std::chrono::milliseconds>(tok_times[i] - start_time_val).count();
+            raw_counters.m_durations[i] = tok_times[i] - start_time_val;
 
             // If in 10 ms a batch of 5 new tokens is generated then TTOT is 10 ms / 5.
             // todo: float check that it's valid for batch > 1.
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index 6390fc8725..dc631c68ac 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -219,13 +219,8 @@ class Sampler {
     }
 
     Token _greedy_sample(const std::vector<Token>& logit_vector) const {
-        Token max_token{-std::numeric_limits<float>::infinity(), 0};
-        for (const auto& logit : logit_vector) {
-            if (logit.m_log_prob > max_token.m_log_prob) {
-                max_token = logit;
-            }
-        }
-        return max_token;
+        auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; });
+        return *out_token;
     }
 
     std::vector<Token> _multinomial_sample(const std::vector<Token>& logit_vector, size_t num_tokens_per_sequence) {
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 446ef8549b..25acc1c87f 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -12,20 +12,6 @@ namespace ov {
 namespace genai {
 namespace utils {
 
-#include <chrono>
-#include <utility>
-#include <functional>
-
-// Templated function to measure execution time of an object method.
-template <typename T, typename Ret, typename... Args>
-std::pair<Ret, long long> execution_time_wrapper(T& instance, Ret(T::*method)(Args...), Args&&... args) {
-    auto start = std::chrono::steady_clock::now();
-    Ret result = (instance.*method)(std::forward<Args>(args)...);
-    auto end = std::chrono::steady_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-    return {result, duration};
-}
-
 Tensor init_attention_mask(const Tensor& position_ids);
 
 void print_tensor(const ov::Tensor& tensor);
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index 860d3c3592..e2f89cd962 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -537,7 +537,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); })
         .def_readonly("scores", &DecodedResults::scores)
         .def_readonly("metrics", &DecodedResults::metrics)
-        .def("__str__", &DecodedResults::operator std::string);;
+        .def("__str__", &DecodedResults::operator std::string);
 
     py::class_<RawPerfMetrics>(m, "RawPerfMetrics")
         .def(py::init<>())
@@ -566,7 +566,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         .def_readonly("std_ttft", &PerfMetrics::std_ttft)
        .def_readonly("load_time", &PerfMetrics::load_time)
         .def("__add__", &PerfMetrics::operator+)
-        .def("__iadd__", &PerfMetrics::operator+=);
+        .def("__iadd__", &PerfMetrics::operator+=)
+        .def_readonly("raw_counters", &PerfMetrics::raw_counters)
+        ;
 
     py::class_<TokenizedInputs>(m, "TokenizedInputs")
         .def(py::init<ov::Tensor, ov::Tensor>())
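
Usage note: with the bindings above, `PerfMetrics` objects can be accumulated across `generate()` calls from Python, and both the derived statistics and the newly exposed `raw_counters` are reachable. A minimal sketch, assuming the sample's `ov_genai` alias comes from `import openvino_genai as ov_genai`, that a `mean_ttft` counterpart to the bound `std_ttft` exists, and that the `raw_counters` vectors convert to Python sequences:

```python
# Hedged usage sketch for the metrics API touched by this patch; names marked
# as assumptions are not confirmed by the diff itself.
import openvino_genai as ov_genai  # assumption: module import name

pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0/", "CPU")

config = ov_genai.GenerationConfig()
config.max_new_tokens = 20

prompt = "The Sky is blue because"

# One warmup run so model compilation does not leak into the statistics.
pipe.generate(prompt, config)

# `__iadd__` (bound above) merges the raw duration samples of each run.
metrics = pipe.generate(prompt, config).metrics
for _ in range(4):
    metrics += pipe.generate(prompt, config).metrics

print(f"Load time: {metrics.load_time:.2f} ms")
print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms")  # assumption: mean_ttft bound like std_ttft

# raw_counters allows custom statistics, e.g. counting the merged samples
# (assumption: the vector member converts to a Python sequence).
print(len(metrics.raw_counters.generate_durations))
```

Because `+=` concatenates raw counters before `evaluate_statistics()` runs, the reported means and standard deviations are computed over all iterations rather than averaged per run, which is the pattern the updated Python sample relies on.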