add cpp Readme, ensured correct batch processing, add PerfMetrics to Readme
pavel-esir committed Jul 23, 2024
1 parent 0a8f0d9 commit 90320f4
Showing 17 changed files with 278 additions and 115 deletions.
2 changes: 1 addition & 1 deletion samples/CMakeLists.txt
@@ -10,7 +10,7 @@ add_subdirectory(cpp/greedy_causal_lm)
add_subdirectory(cpp/multinomial_causal_lm)
add_subdirectory(cpp/prompt_lookup_decoding_lm)
add_subdirectory(cpp/speculative_decoding_lm)
add_subdirectory(cpp/benchmark_vanilla_genai)
add_subdirectory(cpp/benchmark_genai)

install(FILES requirements.txt DESTINATION samples
COMPONENT cpp_samples_genai)
@@ -12,14 +12,14 @@ FetchContent_Declare(cxxopts
URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
FetchContent_MakeAvailable(cxxopts)

add_executable(benchmark_vanilla_genai benchmark_vanilla_genai.cpp)
target_link_libraries(benchmark_vanilla_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_vanilla_genai PROPERTIES
COMPILE_PDB_NAME benchmark_vanilla_genai
add_executable(benchmark_genai benchmark_genai.cpp)
target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
set_target_properties(benchmark_genai PROPERTIES
COMPILE_PDB_NAME benchmark_genai
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)
# target_compile_features(benchmark_vanilla_genai PRIVATE cxx_std_11)
install(TARGETS benchmark_vanilla_genai
# target_compile_features(benchmark_genai PRIVATE cxx_std_11)
install(TARGETS benchmark_genai
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
47 changes: 47 additions & 0 deletions samples/cpp/benchmark_genai/README.md
@@ -0,0 +1,47 @@
# Benchmarking Vanilla GenAI

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.

It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
```

## Usage

```sh
benchmark_genai [OPTIONS]
```

### Options

- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output:

```
benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
Load time: 3405.69 ms
Generate time: 1430.77 ± 3.04 ms
Tokenization time: 0.51 ± 0.02 ms
Detokenization time: 0.37 ± 0.01 ms
TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput tokens/s: 13.98 ± 0.53
```

For more information on how performance metrics are calculated, please follow the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
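
A trimmed sketch of the measurement loop the sample implements is shown below (command-line parsing and error handling omitted; the model path and prompt are placeholders). It follows the `benchmark_genai.cpp` changes in this commit: warm-up runs are excluded, and the `PerfMetrics` from each iteration are accumulated with `operator+`, which recalculates the mean/std values; the mean/std field names are assumed to match those used by the Python sample.

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder values; the real sample reads these from the command line.
    std::string model_path = "TinyLlama-1.1B-Chat-v1.0";
    std::string prompt = "The Sky is blue because";
    size_t num_warmup = 1, num_iter = 3;

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;

    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Warm-up iterations are excluded from the reported statistics.
    for (size_t i = 0; i < num_warmup; i++)
        pipe.generate(prompt, config);

    // Each generate call returns DecodedResults carrying a PerfMetrics object;
    // operator+ accumulates them and recalculates mean/std values.
    ov::genai::DecodedResults res = pipe.generate(prompt, config);
    ov::genai::PerfMetrics metrics = res.perf_metrics;
    for (size_t i = 0; i < num_iter - 1; i++) {
        res = pipe.generate(prompt, config);
        metrics = metrics + res.perf_metrics;
    }

    std::cout << "Load time: " << metrics.load_time << " ms\n";
    std::cout << "TTFT: " << metrics.mean_ttft << " ± " << metrics.std_ttft << " ms\n";
    std::cout << "TPOT: " << metrics.mean_tpot << " ± " << metrics.std_tpot << " ms\n";
}
```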
@@ -8,11 +8,11 @@ int main(int argc, char* argv[]) try {
cxxopts::Options options("benchmark_vanilla_genai", "Help command");

options.add_options()
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("mt,max_new_tokens", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
("h,help", "Print usage");

@@ -38,17 +38,19 @@ int main(int argc, char* argv[]) try {

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();
config.num_beam_groups = 3;
config.num_beams = 15;

ov::genai::LLMPipeline pipe(model_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);

ov::genai::DecodedResults res = pipe.generate(prompt, config);
ov::genai::PerfMetrics metrics = res.metrics;
ov::genai::PerfMetrics metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
res = pipe.generate(prompt, config);
metrics = metrics + res.metrics;
metrics = metrics + res.perf_metrics;
}

std::cout << "Load time: " << metrics.load_time << " ms" << std::endl;
3 changes: 0 additions & 3 deletions samples/cpp/benchmark_vanilla_genai/README.md

This file was deleted.

@@ -1,28 +1,7 @@
# Benchmark Vanilla GenAI
# Benchmarking Vanilla GenAI

This sample script demonstrates how to benchmark an LLM in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics.

# ov.genai.PerfMetrics structure
ov.genai.PerfMetrics is a structure which holds performance metrics for each generate call. Each generate call calculates the following metrics:
- mean_ttft
- std_ttft
- mean_tpot
- std_tpot
- load_time
- mean_generate_duration
- std_generate_duration
- mean_tokenization_duration
- std_tokenization_duration
- mean_detokenization_duration
- std_detokenization_duration
- mean_throughput
- std_throughput
- num_generated_tokens
- num_input_tokens

Performance metrics can be added to one another and accumulated using the += operator or the + operator. In that case the mean values accumulated by several generate calls will be calculated.


## Download and convert the model and tokenizers

The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
@@ -45,14 +24,14 @@ python benchmark_vanilla_genai.py [OPTIONS]
- `-m, --model`: Path to the model and tokenizers base directory.
- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-n, --num_iter` (default: `3`): Number of iterations.
- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
- `-d, --device` (default: `"CPU"`): Device to run the model on.

### Output:

```
python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0/
python benchmark_vanilla_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10
```

```
@@ -64,4 +43,5 @@ TTFT: 81.60 ± 0.54 ms
TPOT: 71.52 ± 2.72 ms
Throughput tokens/s: 13.98 ± 0.53
```

For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics).
@@ -3,7 +3,6 @@

import argparse
import openvino_genai as ov_genai
import pdb

def main():
parser = argparse.ArgumentParser(description="Help command")
@@ -16,6 +15,8 @@ def main():

args = parser.parse_args()

# Perf metrics are stored in DecodedResults.
# In order to get DecodedResults instead of a string, the input should be a list.
prompt = [args.prompt]
model_path = args.model
device = args.device
@@ -24,26 +25,27 @@

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens
config.num_beam_groups = 3
config.num_beams = 15

pipe = ov_genai.LLMPipeline(model_path, device)

for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
metrics = res.metrics
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
# pdb.set_trace()
res = pipe.generate(prompt, config)
metrics += res.metrics
perf_metrics += res.perf_metrics

print(f"Load time: {metrics.load_time:.2f} ms")
print(f"Generate time: {metrics.mean_generate_duration:.2f} ± {metrics.std_generate_duration:.2f} ms")
print(f"Tokenization time: {metrics.mean_tokenization_duration:.2f} ± {metrics.std_tokenization_duration:.2f} ms")
print(f"Detokenization time: {metrics.mean_detokenization_duration:.2f} ± {metrics.std_detokenization_duration:.2f} ms")
print(f"TTFT: {metrics.mean_ttft:.2f} ± {metrics.std_ttft:.2f} ms")
print(f"TPOT: {metrics.mean_tpot:.2f} ± {metrics.std_tpot:.2f} ms")
print(f"Throughput tokens/s: {metrics.mean_throughput:.2f} ± {metrics.std_throughput:.2f}")
print(f"Load time: {perf_metrics.load_time:.2f} ms")
print(f"Generate time: {perf_metrics.mean_generate_duration:.2f} ± {perf_metrics.std_generate_duration:.2f} ms")
print(f"Tokenization time: {perf_metrics.mean_tokenization_duration:.2f} ± {perf_metrics.std_tokenization_duration:.2f} ms")
print(f"Detokenization time: {perf_metrics.mean_detokenization_duration:.2f} ± {perf_metrics.std_detokenization_duration:.2f} ms")
print(f"TTFT: {perf_metrics.mean_ttft:.2f} ± {perf_metrics.std_ttft:.2f} ms")
print(f"TPOT: {perf_metrics.mean_tpot:.2f} ± {perf_metrics.std_tpot:.2f} ms")
print(f"Throughput tokens/s: {perf_metrics.mean_throughput:.2f} ± {perf_metrics.std_throughput:.2f}")

if __name__ == "__main__":
main()
62 changes: 62 additions & 0 deletions samples/python/benchmark_genai/benchmark_genai_automatic.py
@@ -0,0 +1,62 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai as ov_genai

def main():
parser = argparse.ArgumentParser(description="Help command")
parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory")
parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt")
parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations")
parser.add_argument("-n", "--num_iter", type=int, default=5, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

# Perf metrics are stored in DecodedResults.
# In order to get DecodedResults instead of a string, the input should be a list.

model_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens
# config.num_beam_groups = 3
# config.num_beams = 15

pipe = ov_genai.LLMPipeline(model_path, device)

import pandas as pd
metrics_df = pd.DataFrame(columns=['batch_size', 'throughput', 'ttft', 'tpot', 'std_throughput', 'std_ttft', 'std_tpot'])

batch_sizes = [1, 2, 4, 16, 32, 64, 256]
for batch_size in batch_sizes:
prompt = [args.prompt] * batch_size
for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
metrics += res.perf_metrics
metrics_df = metrics_df._append({
'batch_size': batch_size,
'throughput': metrics.mean_throughput,
'ttft': metrics.mean_ttft,
'tpot': metrics.mean_tpot,
'std_throughput': metrics.std_throughput,
'std_ttft': metrics.std_ttft,
'std_tpot': metrics.std_tpot,
}, ignore_index=True)

metrics_df.to_csv('metrics.csv', index=False)

if __name__ == "__main__":
main()
49 changes: 49 additions & 0 deletions src/README.md
@@ -196,6 +196,55 @@ int main(int argc, char* argv[]) {
}
```
### Performance Metrics
`ov.genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` holds fields with mean and standard deviation values for the following metrics:
- `ttft`
- `tpot`
- `load_time`
- `generate_duration`
- `tokenization_duration`
- `detokenization_duration`
- `throughput`
and:
- `num_generated_tokens`
- `num_input_tokens`
Performance metrics are stored in the `perf_metrics` field of either `DecodedResults` or `EncodedResults`. In addition to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `ov.genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as medians or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`.
```python
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, "CPU")
res = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
perf_metrics = res.perf_metrics
print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}')
print(f'ttft: {perf_metrics.mean_ttft:.2f}')
print(f'tpot: {perf_metrics.mean_tpot:.2f}')
```
output:
```sh
generate_duration: 76.28
ttft: 42.58
tpot: 3.80
```

>**Note**: If the input prompt is just a string, the generate function will return only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs.

Several `perf_metrics` objects can be added to one another. In that case `raw_metrics` are concatenated and the mean/std values are recalculated. This is useful for benchmarking, as it accumulates statistics from several generate calls.

```python
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, "CPU")
res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20)
perf_metrics = res_1.perf_metrics + res_2.perf_metrics

print(f'generate_duration: {perf_metrics.mean_generate_duration:.2f}')
print(f'ttft: {perf_metrics.mean_ttft:.2f}')
print(f'tpot: {perf_metrics.mean_tpot:.2f}')
```
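
The same accumulation is available from C++, since `DecodedResults` and `EncodedResults` now expose a `perf_metrics` member (see the `llm_pipeline.hpp` change below). A minimal C++ sketch, assuming the mean/std field names mirror their Python counterparts and using a placeholder model path:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model path

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;

    std::string prompt_1 = "The Sun is yellow because";
    std::string prompt_2 = "Why Sky is blue because";
    ov::genai::DecodedResults res_1 = pipe.generate(prompt_1, config);
    ov::genai::DecodedResults res_2 = pipe.generate(prompt_2, config);

    // operator+ concatenates raw_metrics and recalculates the mean/std values.
    ov::genai::PerfMetrics perf_metrics = res_1.perf_metrics + res_2.perf_metrics;

    std::cout << "generate_duration: " << perf_metrics.mean_generate_duration << "\n";
    std::cout << "ttft: " << perf_metrics.mean_ttft << "\n";
    std::cout << "tpot: " << perf_metrics.mean_tpot << "\n";
}
```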
## How It Works
For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md).
4 changes: 2 additions & 2 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -37,7 +37,7 @@ class EncodedResults {
public:
std::vector<std::vector<int64_t>> tokens;
std::vector<float> scores;
PerfMetrics metrics;
PerfMetrics perf_metrics;
};

/**
@@ -52,7 +52,7 @@ class DecodedResults {
public:
std::vector<std::string> texts;
std::vector<float> scores;
PerfMetrics metrics;
PerfMetrics perf_metrics;

// @brief Convert DecodedResults to a string.
operator std::string() const {