From 4146960d52bc435bd42ef140b7b0f09bb4014057 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 24 Mar 2024 08:16:22 +0100 Subject: [PATCH 01/13] server: bench: init --- .github/workflows/bench.yml | 183 ++++++++++++ examples/server/bench/bench.py | 267 ++++++++++++++++++ examples/server/bench/prometheus.yml | 9 + examples/server/bench/requirements.txt | 2 + examples/server/tests/features/steps/steps.py | 19 +- 5 files changed, 471 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/bench.yml create mode 100644 examples/server/bench/bench.py create mode 100644 examples/server/bench/prometheus.yml create mode 100644 examples/server/bench/requirements.txt diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 0000000000000..788e7de358743 --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,183 @@ +# Benchmark +name: Benchmark + +on: + workflow_dispatch: + inputs: + gpu-series: + description: 'Azure GPU series to run with' + required: true + type: choice + options: + - Standard_NC4as_T4_v3 + - Standard_NC64as_T4_v3 + - Standard_NC24ads_A100_v4 + - Standard_NC48ads_A100_v4 + - Standard_ND96asr_A100_v4 + - Standard_NC40ads_H100_v5 + - Standard_NC80adis_H100_v5 + push: + branches: + - master + paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*'] + schedule: + - cron: '04 2 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + bench-server-baseline: + runs-on: Standard_NC4as_T4_v3 + env: + RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it + if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }} + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Install python env + id: pipenv + run: | + cd examples/server/bench + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + + - name: Prometheus + id: install_prometheus + run: | + wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz + tar xzf prometheus*.tar.gz --strip-components=1 + ./prometheus --config.file=examples/server/bench/prometheus.yml & + while ! nc -z localhost 9090; do + sleep 0.1 + done + + - name: Install k6 + id: k6_installation + run: | + cd examples/server/bench + wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz + tar xzf k6*.tar.gz --strip-components=1 + + - name: Build + id: cmake_build + run: | + set -eux + mkdir build + cd build + cmake .. \ + -DLLAMA_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DLLAMA_CUBLAS=ON \ + -DCUDAToolkit_ROOT=/usr/local/cuda \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=75 \ + -DLLAMA_FATAL_WARNINGS=OFF \ + -DLLAMA_ALL_WARNINGS=OFF \ + -DCMAKE_BUILD_TYPE=Release; + cmake --build . 
--config Release -j $(nproc) --target server + + - name: Download the dataset + id: download_dataset + run: | + cd examples/server/bench + wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + - name: Server bench + id: server_bench + run: | + set -eux + + cd examples/server/bench + source venv/bin/activate + BENCH_K6_BIN_PATH=./k6 python bench.py \ + --runner-label ${{ env.RUNNER_LABEL }} \ + --name ${{ github.job }} \ + --branch ${{ github.head_ref || github.ref_name }} \ + --commit ${{ github.sha }} \ + --scenario script.js \ + --duration 10m \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model-path-prefix /models \ + --parallel 8 \ + -ngl 33 \ + --batch-size 2048 \ + --ubatch-size 256 \ + --ctx-size 16384 \ + --n-prompts 1000 \ + --max-prompt-tokens 1024 \ + --max-tokens 2048 + + cat results.github.env >> $GITHUB_ENV + + - name: Commit status + uses: Sibz/github-status-action@v1 + with: + authToken: ${{secrets.GITHUB_TOKEN}} + context: bench-server-baseline + description: | + ${{ env.BENCH_RESULTS }} + state: 'success' + + - name: Upload benchmark images + uses: devicons/public-upload-to-imgur@v2.2.2 + id: imgur_step + with: + client_id: ${{secrets.IMGUR_CLIENT_ID}} + path: | + examples/server/bench/prompt_tokens_seconds.png + examples/server/bench/predicted_tokens_seconds.png + examples/server/bench/kv_cache_usage_ratio.png + examples/server/bench/requests_processing.png + examples/server/bench/requests_deferred.png + + - name: Comment PR + uses: mshick/add-pr-comment@v2 + id: comment_pr + if: ${{ github.event.pull_request != '' }} + with: + message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }} + message: | + 📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 +

+            <p align="center">
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds"/>
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds"/>
+            </p>
+
+            <details>
+            <summary>Details</summary>
+            <p align="center">
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio"/>
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing"/>
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[4] }}" alt="requests_deferred"/>
+            </p>
+            </details>

+ + + - name: Upload results + if: ${{ github.event.pull_request }} + uses: edunad/actions-image@v2.0.0 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + path: 'examples/server/bench/*.png' + title: | + llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG}}tk/s + annotationLevel: 'success' + + - uses: actions/upload-artifact@v4 + with: + name: benchmark-results + compression-level: 9 + path: | + examples/server/bench/**/.png + examples/server/bench/**/.json + examples/server/bench/**/.log diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py new file mode 100644 index 0000000000000..c0e08ae197227 --- /dev/null +++ b/examples/server/bench/bench.py @@ -0,0 +1,267 @@ +import argparse +import base64 +import json +import os +import re +import signal +import socket +import subprocess +import sys +import threading +import time +import traceback +from contextlib import closing +from datetime import datetime + +import matplotlib.pyplot as plt +import requests + + +def main(args_in: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description="Start server benchmark scenario") + parser.add_argument("--name", type=str, help="Bench name", required=True) + parser.add_argument("--runner-label", type=str, help="Runner label", required=True) + parser.add_argument("--branch", type=str, help="Branch name", default="detached") + parser.add_argument("--commit", type=str, help="Commit name", default="dirty") + parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0") + parser.add_argument("--port", type=int, help="Server listen host", default="8080") + parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models") + parser.add_argument("--n-prompts", type=int, + help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True) + parser.add_argument("--max-prompt-tokens", type=int, + help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset", + required=True) + parser.add_argument("--max-tokens", type=int, + help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens", + required=True) + parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True) + parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True) + parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True) + parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True) + parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True) + parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True) + parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True) + parser.add_argument("--scenario", type=str, help="Scenario to run", required=True) + parser.add_argument("--duration", type=str, help="Bench scenario", required=True) + + args = parser.parse_args(args_in) + + start_time = time.time() + + # Start the server and performance scenario + try: + server_process = start_server(args) + except Exception: + print("bench: server start error :") + traceback.print_exc(file=sys.stdout) + sys.exit(1) + + # start the benchmark + try: + 
start_benchmark(args) + + iterations = 0 + with open("results.github.env", 'w') as github_env: + # parse output + with open('k6-results.json', 'r') as bench_results: + # Load JSON data from file + data = json.load(bench_results) + for metric_name in data['metrics']: + for metric_metric in data['metrics'][metric_name]: + value = data['metrics'][metric_name][metric_metric] + if isinstance(value, float): + value = round(value, 2) + data['metrics'][metric_name][metric_metric]=value + github_env.write( + f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n") + token_seconds = data['metrics']['llamacpp_tokens_second']['avg'] + iterations = data['root_group']['checks']['success completion']['passes'] + + except Exception: + print("bench: error :") + traceback.print_exc(file=sys.stdout) + + # Stop the server + if server_process: + try: + print(f"bench: shutting down server pid={server_process.pid} ...") + if os.name == 'nt': + interrupt = signal.CTRL_C_EVENT + else: + interrupt = signal.SIGINT + server_process.send_signal(interrupt) + server_process.wait(0.5) + + except subprocess.TimeoutExpired: + print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...") + server_process.kill() # SIGKILL + server_process.wait() + + while is_server_listening(args.host, args.port): + time.sleep(0.1) + + # Prometheus + end_time = time.time() + if is_server_listening("0.0.0.0", 9090): + metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds', + 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred'] + + for metric in metrics: + resp = requests.get(f"http://localhost:9090/api/v1/query_range", + params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2}) + if resp.status_code != 200: + print(f"bench: unable to extract prometheus metric {metric}: {resp.text}") + else: + metric_data = resp.json() + values = metric_data['data']['result'][0]['values'] + timestamps, metric_values = zip(*values) + metric_values = [float(value) for value in metric_values] + timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps] + plt.figure(figsize=(16, 10), dpi=80) + plt.plot(timestamps, metric_values, label=metric) + plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) + plt.yticks(fontsize=12, alpha=.7) + + plt.title(f"llama.cpp {args.name} on {args.runner_label}\n" + f"duration={args.duration} {iterations} iterations", + fontsize=14, wrap=True) + plt.grid(axis='both', alpha=.3) + plt.ylabel(f"llamacpp:{metric}", fontsize=22) + plt.xlabel(f"{args.hf_repo}/{args.hf_file}\n" + f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n" + f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" + f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True) + plt.gcf().autofmt_xdate() + + # Remove borders + plt.gca().spines["top"].set_alpha(0.0) + plt.gca().spines["bottom"].set_alpha(0.3) + plt.gca().spines["right"].set_alpha(0.0) + plt.gca().spines["left"].set_alpha(0.3) + + # Save the plot as a PNG image + plt.savefig(f'{metric}.png') + plt.close() + + # 140 chars max for commit status description + bench_results = { + "req": { + "p90": data['metrics']["http_req_duration"]["p(90)"], + "avg": data['metrics']["http_req_duration"]["avg"], + }, + "pp": { + "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"], + "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"], + }, + "tg": { + "p90": 
data['metrics']["llamacpp_tokens_second"]["p(90)"], + "avg": data['metrics']["llamacpp_tokens_second"]["avg"], + }, + } + with open("results.github.env", 'a') as github_env: + github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n") + github_env.write(f"BENCH_ITERATIONS={iterations}\n") + + +def start_benchmark(args): + k6_path = 'k6' + if 'BENCH_K6_BIN_PATH' in os.environ: + k6_path = os.environ['BENCH_K6_BIN_PATH'] + k6_args = [ + 'run', args.scenario, + '--no-color', + ] + k6_args.extend(['--duration', args.duration]) + k6_args.extend(['--iterations', args.n_prompts]) + k6_args.extend(['--vus', args.parallel]) + k6_args.extend(['--summary-export', 'k6-results.json']) + args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} " + args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]]) + print(f"bench: starting k6 with: {args}") + k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr) + if k6_completed.returncode != 0: + raise Exception("bench: unable to run k6") + + +def start_server(args): + server_process = start_server_background(args) + + attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + + while not is_server_listening(args.host, args.port): + attempts += 1 + if attempts > max_attempts: + assert False, "server not started" + print(f"bench: waiting for server to start ...") + time.sleep(0.5) + + print("bench: server started.") + return server_process + + +def start_server_background(args): + # Start the server + server_path = '../../../build/bin/server' + if 'LLAMA_SERVER_BIN_PATH' in os.environ: + server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_args = [ + '--host', args.host, + '--port', args.port, + ] + model_file = args.model_path_prefix + os.path.sep + args.hf_file + model_dir = os.path.dirname(model_file) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + server_args.extend(['--model', model_file]) + server_args.extend(['--hf-repo', args.hf_repo]) + server_args.extend(['--hf-file', args.hf_file]) + server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) + server_args.extend(['--ctx-size', args.ctx_size]) + server_args.extend(['--parallel', args.parallel]) + server_args.extend(['--batch-size', args.batch_size]) + server_args.extend(['--ubatch-size', args.ubatch_size]) + server_args.extend(['--n-predict', args.max_tokens * 2]) + server_args.extend(['--defrag-thold', "0.1"]) + server_args.append('--cont-batching') + server_args.append('--metrics') + server_args.extend(['--log-format', "text"]) + args = [str(arg) for arg in [server_path, *server_args]] + print(f"bench: starting server with: {' '.join(args)}") + pkwargs = { + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE + } + server_process = subprocess.Popen( + args, + **pkwargs) + + def server_log(in_stream, out_stream): + for line in iter(in_stream.readline, b''): + print(line.decode('utf-8'), end='', file=out_stream) + + thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout)) + thread_stdout.start() + thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr)) + thread_stderr.start() + + return server_process + + +def is_server_listening(server_fqdn, server_port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((server_fqdn, server_port)) + 
_is_server_listening = result == 0 + if _is_server_listening: + print(f"server is listening on {server_fqdn}:{server_port}...") + return _is_server_listening + + +def escape_metric_name(metric_name): + return re.sub('[^A-Z0-9]', '_', metric_name.upper()) + + +if __name__ == '__main__': + main() diff --git a/examples/server/bench/prometheus.yml b/examples/server/bench/prometheus.yml new file mode 100644 index 0000000000000..b15ee52443fe8 --- /dev/null +++ b/examples/server/bench/prometheus.yml @@ -0,0 +1,9 @@ +global: + scrape_interval: 10s + external_labels: + llamacpp: 'server' + +scrape_configs: + - job_name: 'llama.cpp server' + static_configs: + - targets: ['localhost:8080'] diff --git a/examples/server/bench/requirements.txt b/examples/server/bench/requirements.txt new file mode 100644 index 0000000000000..66ed226eda6f0 --- /dev/null +++ b/examples/server/bench/requirements.txt @@ -0,0 +1,2 @@ +matplotlib +requests diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 86c3339dc7183..9a6cf7d6aae07 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1114,7 +1114,10 @@ def start_server_background(context): server_args.append('--verbose') if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) - print(f"starting server with: {context.server_path} {server_args}") + + args = [str(arg) for arg in [context.server_path, *server_args]] + print(f"bench: starting server with: {' '.join(args)}") + flags = 0 if 'nt' == os.name: flags |= subprocess.DETACHED_PROCESS @@ -1130,16 +1133,14 @@ def start_server_background(context): [str(arg) for arg in [context.server_path, *server_args]], **pkwargs) - def log_stdout(process): - for line in iter(process.stdout.readline, b''): - print(line.decode('utf-8'), end='') - thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,)) + def server_log(in_stream, out_stream): + for line in iter(in_stream.readline, b''): + print(line.decode('utf-8'), end='', file=out_stream) + + thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout)) thread_stdout.start() - def log_stderr(process): - for line in iter(process.stderr.readline, b''): - print(line.decode('utf-8'), end='', file=sys.stderr) - thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,)) + thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr)) thread_stderr.start() print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") From 799317b27d0465fa66221a9f5b096baa5619fa27 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 25 Mar 2024 21:15:09 +0100 Subject: [PATCH 02/13] server: bench: reduce list of GPU nodes --- .github/workflows/bench.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 788e7de358743..b0788cccf0dde 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -10,11 +10,7 @@ on: type: choice options: - Standard_NC4as_T4_v3 - - Standard_NC64as_T4_v3 - Standard_NC24ads_A100_v4 - - Standard_NC48ads_A100_v4 - - Standard_ND96asr_A100_v4 - - Standard_NC40ads_H100_v5 - Standard_NC80adis_H100_v5 push: branches: From 5c0b2a2b597b14cfcde575188eeb64eb1cda9164 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Mon, 25 Mar 2024 21:44:45 +0100 Subject: [PATCH 03/13] server: bench: fix graph, fix output 
artifact --- .github/workflows/bench.yml | 6 +++--- examples/server/bench/bench.py | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b0788cccf0dde..49a1700a8316c 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -174,6 +174,6 @@ jobs: name: benchmark-results compression-level: 9 path: | - examples/server/bench/**/.png - examples/server/bench/**/.json - examples/server/bench/**/.log + examples/server/bench/*.png + examples/server/bench/*.json + examples/server/bench/*.log diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index c0e08ae197227..3a213cce9c43f 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -1,5 +1,4 @@ import argparse -import base64 import json import os import re @@ -13,6 +12,8 @@ from contextlib import closing from datetime import datetime +import matplotlib +import matplotlib.dates import matplotlib.pyplot as plt import requests @@ -109,6 +110,10 @@ def main(args_in: list[str] | None = None) -> None: for metric in metrics: resp = requests.get(f"http://localhost:9090/api/v1/query_range", params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2}) + + with open(f"{metric}.json", 'w') as metric_json: + metric_json.write(resp.text) + if resp.status_code != 200: print(f"bench: unable to extract prometheus metric {metric}: {resp.text}") else: @@ -131,6 +136,8 @@ def main(args_in: list[str] | None = None) -> None: f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n" f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True) + plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator()) + plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S")) plt.gcf().autofmt_xdate() # Remove borders From 93434fdc7e98ee82ce837bb76f0c7053ddac8770 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 01:08:59 +0100 Subject: [PATCH 04/13] ci: bench: add mermaid in case of image cannot be uploaded --- .github/workflows/bench.yml | 116 +++++++++++++++++++++++++-------- examples/server/bench/bench.py | 49 +++++++++++--- 2 files changed, 129 insertions(+), 36 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 49a1700a8316c..65d89015873ec 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -117,6 +117,18 @@ jobs: cat results.github.env >> $GITHUB_ENV + # Remove dataset as we do not want it in the artefact + rm ShareGPT_V3_unfiltered_cleaned_split.json + + - uses: actions/upload-artifact@v4 + with: + name: benchmark-results + compression-level: 9 + path: | + examples/server/bench/*.png + examples/server/bench/*.json + examples/server/bench/*.log + - name: Commit status uses: Sibz/github-status-action@v1 with: @@ -128,6 +140,7 @@ jobs: - name: Upload benchmark images uses: devicons/public-upload-to-imgur@v2.2.2 + continue-on-error: true # Important as it looks unstable: 503 id: imgur_step with: client_id: ${{secrets.IMGUR_CLIENT_ID}} @@ -136,44 +149,95 @@ jobs: examples/server/bench/predicted_tokens_seconds.png examples/server/bench/kv_cache_usage_ratio.png examples/server/bench/requests_processing.png - examples/server/bench/requests_deferred.png + + - name: Extract mermaid + id: set_mermaid + run: | + set -eux + + cd 
examples/server/bench
+          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
+          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
+          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
+          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV

       - name: Comment PR
         uses: mshick/add-pr-comment@v2
         id: comment_pr
         if: ${{ github.event.pull_request != '' }}
+        continue-on-error: true
         with:
           message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
           message: |
             📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+            - req_avg=${{ env.HTTP_REQ_DURATION_AVG }} pp_avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }} tks_avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}
+
+

-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds"/>
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds"/>
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+

             </p>
             <details>
             <summary>Details</summary>
             <p align="center">

-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio"/>
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing"/>
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[4] }}" alt="requests_deferred"/>
-

- - - - name: Upload results - if: ${{ github.event.pull_request }} - uses: edunad/actions-image@v2.0.0 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - path: 'examples/server/bench/*.png' - title: | - llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG}}tk/s - annotationLevel: 'success' + kv_cache_usage_ratio - - uses: actions/upload-artifact@v4 - with: - name: benchmark-results - compression-level: 9 - path: | - examples/server/bench/*.png - examples/server/bench/*.json - examples/server/bench/*.log +
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing"/>
+
+            <details>
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
+
+            </p>
+            </details>

+
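Aside on the `set_mermaid` step above: the `echo "...<<EOF"` lines follow GitHub Actions' documented multiline-value protocol for `$GITHUB_ENV` — a `NAME<<DELIMITER` line, the payload, then the bare delimiter on its own line. A minimal Python sketch of what those shell lines emit (the helper name is illustrative, and it assumes the delimiter never occurs in the payload):

```python
import os

def set_multiline_env(name: str, value: str) -> None:
    # Append a NAME<<EOF ... EOF block to the file GITHUB_ENV points at;
    # the runner parses it back into an environment variable for later steps.
    delimiter = "EOF"
    if delimiter in value:
        raise ValueError("delimiter must not occur in the payload")
    with open(os.environ["GITHUB_ENV"], "a") as env_file:
        env_file.write(f"{name}<<{delimiter}\n{value}\n{delimiter}\n")
```

This is only runnable inside a workflow step, where `GITHUB_ENV` is defined.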
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 3a213cce9c43f..cee9724319c69 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -101,6 +101,12 @@ def main(args_in: list[str] | None = None) -> None: while is_server_listening(args.host, args.port): time.sleep(0.1) + title = (f"llama.cpp {args.name} on {args.runner_label}\n " + f"duration={args.duration} {iterations} iterations") + xlabel = (f"{args.hf_repo}/{args.hf_file}\n" + f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" + f"branch={args.branch} commit={args.commit}") + # Prometheus end_time = time.time() if is_server_listening("0.0.0.0", 9090): @@ -121,23 +127,20 @@ def main(args_in: list[str] | None = None) -> None: values = metric_data['data']['result'][0]['values'] timestamps, metric_values = zip(*values) metric_values = [float(value) for value in metric_values] - timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps] + timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] plt.figure(figsize=(16, 10), dpi=80) - plt.plot(timestamps, metric_values, label=metric) + plt.plot(timestamps_dt, metric_values, label=metric) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) plt.yticks(fontsize=12, alpha=.7) - plt.title(f"llama.cpp {args.name} on {args.runner_label}\n" - f"duration={args.duration} {iterations} iterations", + ylabel = f"llamacpp:{metric}" + plt.title(title, fontsize=14, wrap=True) plt.grid(axis='both', alpha=.3) - plt.ylabel(f"llamacpp:{metric}", fontsize=22) - plt.xlabel(f"{args.hf_repo}/{args.hf_file}\n" - f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n" - f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" - f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True) + plt.ylabel(ylabel, fontsize=22) + plt.xlabel(xlabel, fontsize=14, wrap=True) plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator()) - plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S")) + plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S")) plt.gcf().autofmt_xdate() # Remove borders @@ -150,6 +153,27 @@ def main(args_in: list[str] | None = None) -> None: plt.savefig(f'{metric}.png') plt.close() + # Mermaid format in case image failed + with (open(f"{metric}.mermaid", 'w') as mermaid_f): + mermaid = ( + f"""--- +config: + xyChart: + titleFontSize: 12 + width: 900 + height: 600 + themeVariables: + xyChart: + titleColor: "#000000" +--- +xychart-beta + title "{title}" + y-axis "llamacpp:{metric}" + x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))} + line [{', '.join([str(round(float(value))) for value in metric_values])}] + """) + mermaid_f.write(mermaid) + # 140 chars max for commit status description bench_results = { "req": { @@ -169,6 +193,11 @@ def main(args_in: list[str] | None = None) -> None: github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n") github_env.write(f"BENCH_ITERATIONS={iterations}\n") + title = title.replace('\n', ' ') + xlabel = xlabel.replace('\n', ' ') + github_env.write(f"BENCH_GRAPH_TITLE={title}\n") + github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n") + def start_benchmark(args): k6_path = 'k6' From 
5c2f8e6bfb77dfdf59600b6b566b5f40b8ce4ddc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 08:07:08 +0100 Subject: [PATCH 05/13] ci: bench: more resilient, more metrics --- .github/workflows/bench.yml | 72 ++++++++++++++++++++++++---------- examples/server/bench/bench.py | 8 ++-- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 65d89015873ec..a7b4edcabc130 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -12,6 +12,15 @@ on: - Standard_NC4as_T4_v3 - Standard_NC24ads_A100_v4 - Standard_NC80adis_H100_v5 + sha: + description: 'Commit SHA1 to build' + required: false + type: string + duration: + description: 'Duration of the bench' + type: string + default: 10m + push: branches: - master @@ -31,6 +40,7 @@ jobs: runs-on: Standard_NC4as_T4_v3 env: RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it + N_USERS: 8 if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }} steps: - name: Clone @@ -38,6 +48,7 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - name: Install python env id: pipenv @@ -100,13 +111,13 @@ jobs: --runner-label ${{ env.RUNNER_LABEL }} \ --name ${{ github.job }} \ --branch ${{ github.head_ref || github.ref_name }} \ - --commit ${{ github.sha }} \ + --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ --scenario script.js \ - --duration 10m \ + --duration ${{ github.event.inputs.duration || "10m" }} \ --hf-repo ggml-org/models \ --hf-file phi-2/ggml-model-q4_0.gguf \ --model-path-prefix /models \ - --parallel 8 \ + --parallel ${{ env.N_USERS }} \ -ngl 33 \ --batch-size 2048 \ --ubatch-size 256 \ @@ -125,7 +136,7 @@ jobs: name: benchmark-results compression-level: 9 path: | - examples/server/bench/*.png + examples/server/bench/*.jpg examples/server/bench/*.json examples/server/bench/*.log @@ -133,6 +144,7 @@ jobs: uses: Sibz/github-status-action@v1 with: authToken: ${{secrets.GITHUB_TOKEN}} + sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} context: bench-server-baseline description: | ${{ env.BENCH_RESULTS }} @@ -145,10 +157,10 @@ jobs: with: client_id: ${{secrets.IMGUR_CLIENT_ID}} path: | - examples/server/bench/prompt_tokens_seconds.png - examples/server/bench/predicted_tokens_seconds.png - examples/server/bench/kv_cache_usage_ratio.png - examples/server/bench/requests_processing.png + examples/server/bench/prompt_tokens_seconds.jpg + examples/server/bench/predicted_tokens_seconds.jpg + examples/server/bench/kv_cache_usage_ratio.jpg + examples/server/bench/requests_processing.jpg - name: Extract mermaid id: set_mermaid @@ -176,24 +188,40 @@ jobs: echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV echo "EOF" >> $GITHUB_ENV + - name: Extract image url + id: extrac_image_url + continue-on-error: true + run: | + set -eux + + echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV + echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV + echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV + echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV + - name: Comment PR uses: mshick/add-pr-comment@v2 id: 
comment_pr if: ${{ github.event.pull_request != '' }} - continue-on-error: true with: message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }} message: | - 📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + - Concurrent users: ${{ env.N_USERS }} + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs + - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s** + - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s** + - Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - ${{ env.BENCH_GRAPH_XLABEL }} - - req_avg=${{ env.HTTP_REQ_DURATION_AVG }} pp_avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }} tks_avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }} - - + +

-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds"/>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds"/>

+ More ```mermaid @@ -202,7 +230,7 @@ jobs:
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds"/>
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
More @@ -214,10 +242,14 @@ jobs:

+
- Details -

- kv_cache_usage_ratio + +

Details + +

+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio"/>

More @@ -228,7 +260,7 @@ jobs:
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing"/>
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
More @@ -238,6 +270,6 @@ jobs: ```
- +

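For readers wondering where template names like `HTTP_REQ_DURATION_P_90_` come from: they are produced by `escape_metric_name` in bench.py, which upper-cases the k6 metric name and aggregation and maps every character outside `[A-Z0-9]` to `_`. A quick check, reusing the exact regex from the script:

```python
import re

def escape_metric_name(metric_name: str) -> str:
    # Same substitution as bench.py: anything outside [A-Z0-9] becomes '_'
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())

# k6 metric 'http_req_duration' with aggregation 'p(90)'
print(f"{escape_metric_name('http_req_duration')}_{escape_metric_name('p(90)')}")
# -> HTTP_REQ_DURATION_P_90_
```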
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index cee9724319c69..df8c1398797f8 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -70,7 +70,7 @@ def main(args_in: list[str] | None = None) -> None: for metric_name in data['metrics']: for metric_metric in data['metrics'][metric_name]: value = data['metrics'][metric_name][metric_metric] - if isinstance(value, float): + if isinstance(value, float) or isinstance(value, int): value = round(value, 2) data['metrics'][metric_name][metric_metric]=value github_env.write( @@ -149,11 +149,11 @@ def main(args_in: list[str] | None = None) -> None: plt.gca().spines["right"].set_alpha(0.0) plt.gca().spines["left"].set_alpha(0.3) - # Save the plot as a PNG image - plt.savefig(f'{metric}.png') + # Save the plot as a jpg image + plt.savefig(f'{metric}.jpg', dpi=60) plt.close() - # Mermaid format in case image failed + # Mermaid format in case images upload failed with (open(f"{metric}.mermaid", 'w') as mermaid_f): mermaid = ( f"""--- From 225f63baccc91cbb20c6a498297be8c47249d6fb Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 08:10:39 +0100 Subject: [PATCH 06/13] ci: bench: trigger build --- .github/workflows/bench.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index a7b4edcabc130..b05312f37107a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -215,7 +215,6 @@ jobs: - Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - ${{ env.BENCH_GRAPH_XLABEL }} -

prompt_tokens_seconds From fb3b2f5eb10188c29db46697004a862f7aaca345 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 08:13:32 +0100 Subject: [PATCH 07/13] ci: bench: fix duration --- .github/workflows/bench.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b05312f37107a..90975d8fcfa87 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -41,6 +41,7 @@ jobs: env: RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it N_USERS: 8 + DURATION: 10m if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }} steps: - name: Clone @@ -113,7 +114,7 @@ jobs: --branch ${{ github.head_ref || github.ref_name }} \ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ --scenario script.js \ - --duration ${{ github.event.inputs.duration || "10m" }} \ + --duration ${{ github.event.inputs.duration || env.DURATION }} \ --hf-repo ggml-org/models \ --hf-file phi-2/ggml-model-q4_0.gguf \ --model-path-prefix /models \ From bff4644f4971c6f948a867f94bd537271c980361 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 08:20:28 +0100 Subject: [PATCH 08/13] ci: bench: fix typo --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 90975d8fcfa87..ddd6976c88dd9 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -218,7 +218,7 @@ jobs:

- prompt_tokens_seconds + prompt_tokens_seconds

From 337c13b22688f627775c3892613926c54d2da1d1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Tue, 26 Mar 2024 08:39:30 +0100 Subject: [PATCH 09/13] ci: bench: fix mermaid values, markdown generated --- .github/workflows/bench.yml | 2 +- examples/server/bench/bench.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ddd6976c88dd9..4b44b61d123c6 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -222,7 +222,7 @@ jobs:
- More + More ```mermaid ${{ env.PROMPT_TOKENS_SECONDS }} diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index df8c1398797f8..ea5d3854d561a 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -170,7 +170,7 @@ def main(args_in: list[str] | None = None) -> None: title "{title}" y-axis "llamacpp:{metric}" x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))} - line [{', '.join([str(round(float(value))) for value in metric_values])}] + line [{', '.join([str(round(float(value), 2)) for value in metric_values])}] """) mermaid_f.write(mermaid) From 1c1f8769947ef6e483809beec87b59051cf3e435 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 26 Mar 2024 11:11:32 +0100 Subject: [PATCH 10/13] typo on the step name Co-authored-by: Xuan Son Nguyen --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 4b44b61d123c6..a343488ba6a3d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -190,7 +190,7 @@ jobs: echo "EOF" >> $GITHUB_ENV - name: Extract image url - id: extrac_image_url + id: extract_image_url continue-on-error: true run: | set -eux From 30195d73079241bcb20b239ac961dec9218ce902 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 27 Mar 2024 18:30:28 +0100 Subject: [PATCH 11/13] ci: bench: trailing spaces --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index a343488ba6a3d..72e74a0e0bb9e 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -217,7 +217,7 @@ jobs: - ${{ env.BENCH_GRAPH_XLABEL }}

- + prompt_tokens_seconds

From fce86c3a55fbcd59e50f119a08ab7106180698e1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 27 Mar 2024 19:23:13 +0100 Subject: [PATCH 12/13] ci: bench: move images in a details section --- .github/workflows/bench.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 72e74a0e0bb9e..15857a9721fd4 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -209,13 +209,16 @@ jobs: message: | 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - - Concurrent users: ${{ env.N_USERS }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs + - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs Finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}reqs - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - - Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - ${{ env.BENCH_GRAPH_XLABEL }} +
+
+            <details>
+
+            <summary>Time series</summary>
+

prompt_tokens_seconds @@ -273,3 +276,4 @@ jobs:

+
From 4a6bfa92c5cfa12efa264c4c145dd91e6c8aba60 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 27 Mar 2024 19:55:13 +0100 Subject: [PATCH 13/13] ci: bench: reduce bullet point size --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 15857a9721fd4..c758b5c481730 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -210,7 +210,7 @@ jobs: 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs Finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}reqs + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - ${{ env.BENCH_GRAPH_XLABEL }}