From 4146960d52bc435bd42ef140b7b0f09bb4014057 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sun, 24 Mar 2024 08:16:22 +0100
Subject: [PATCH 01/13] server: bench: init
---
.github/workflows/bench.yml | 183 ++++++++++++
examples/server/bench/bench.py | 267 ++++++++++++++++++
examples/server/bench/prometheus.yml | 9 +
examples/server/bench/requirements.txt | 2 +
examples/server/tests/features/steps/steps.py | 19 +-
5 files changed, 471 insertions(+), 9 deletions(-)
create mode 100644 .github/workflows/bench.yml
create mode 100644 examples/server/bench/bench.py
create mode 100644 examples/server/bench/prometheus.yml
create mode 100644 examples/server/bench/requirements.txt
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
new file mode 100644
index 0000000000000..788e7de358743
--- /dev/null
+++ b/.github/workflows/bench.yml
@@ -0,0 +1,183 @@
+# Benchmark
+name: Benchmark
+
+on:
+ workflow_dispatch:
+ inputs:
+ gpu-series:
+ description: 'Azure GPU series to run with'
+ required: true
+ type: choice
+ options:
+ - Standard_NC4as_T4_v3
+ - Standard_NC64as_T4_v3
+ - Standard_NC24ads_A100_v4
+ - Standard_NC48ads_A100_v4
+ - Standard_ND96asr_A100_v4
+ - Standard_NC40ads_H100_v5
+ - Standard_NC80adis_H100_v5
+ push:
+ branches:
+ - master
+ paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+ pull_request:
+ types: [opened, synchronize, reopened]
+ paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+ schedule:
+ - cron: '04 2 * * *'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ bench-server-baseline:
+ runs-on: Standard_NC4as_T4_v3
+ env:
+      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Found no way to avoid duplicating the runs-on label
+ if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Install python env
+ id: pipenv
+ run: |
+ cd examples/server/bench
+ python3 -m venv venv
+ source venv/bin/activate
+ pip install -r requirements.txt
+
+ - name: Prometheus
+ id: install_prometheus
+ run: |
+ wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
+ tar xzf prometheus*.tar.gz --strip-components=1
+ ./prometheus --config.file=examples/server/bench/prometheus.yml &
+ while ! nc -z localhost 9090; do
+ sleep 0.1
+ done
+
+ - name: Install k6
+ id: k6_installation
+ run: |
+ cd examples/server/bench
+ wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
+ tar xzf k6*.tar.gz --strip-components=1
+
+ - name: Build
+ id: cmake_build
+ run: |
+ set -eux
+ mkdir build
+ cd build
+ cmake .. \
+ -DLLAMA_NATIVE=OFF \
+ -DLLAMA_BUILD_SERVER=ON \
+ -DLLAMA_CURL=ON \
+ -DLLAMA_CUBLAS=ON \
+ -DCUDAToolkit_ROOT=/usr/local/cuda \
+ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
+ -DCMAKE_CUDA_ARCHITECTURES=75 \
+ -DLLAMA_FATAL_WARNINGS=OFF \
+ -DLLAMA_ALL_WARNINGS=OFF \
+ -DCMAKE_BUILD_TYPE=Release;
+ cmake --build . --config Release -j $(nproc) --target server
+
+ - name: Download the dataset
+ id: download_dataset
+ run: |
+ cd examples/server/bench
+ wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+ - name: Server bench
+ id: server_bench
+ run: |
+ set -eux
+
+ cd examples/server/bench
+ source venv/bin/activate
+ BENCH_K6_BIN_PATH=./k6 python bench.py \
+ --runner-label ${{ env.RUNNER_LABEL }} \
+ --name ${{ github.job }} \
+ --branch ${{ github.head_ref || github.ref_name }} \
+ --commit ${{ github.sha }} \
+ --scenario script.js \
+ --duration 10m \
+ --hf-repo ggml-org/models \
+ --hf-file phi-2/ggml-model-q4_0.gguf \
+ --model-path-prefix /models \
+ --parallel 8 \
+ -ngl 33 \
+ --batch-size 2048 \
+ --ubatch-size 256 \
+ --ctx-size 16384 \
+ --n-prompts 1000 \
+ --max-prompt-tokens 1024 \
+ --max-tokens 2048
+
+ cat results.github.env >> $GITHUB_ENV
+
+ - name: Commit status
+ uses: Sibz/github-status-action@v1
+ with:
+ authToken: ${{secrets.GITHUB_TOKEN}}
+ context: bench-server-baseline
+ description: |
+ ${{ env.BENCH_RESULTS }}
+ state: 'success'
+
+ - name: Upload benchmark images
+ uses: devicons/public-upload-to-imgur@v2.2.2
+ id: imgur_step
+ with:
+ client_id: ${{secrets.IMGUR_CLIENT_ID}}
+ path: |
+ examples/server/bench/prompt_tokens_seconds.png
+ examples/server/bench/predicted_tokens_seconds.png
+ examples/server/bench/kv_cache_usage_ratio.png
+ examples/server/bench/requests_processing.png
+ examples/server/bench/requests_deferred.png
+
+ - name: Comment PR
+ uses: mshick/add-pr-comment@v2
+ id: comment_pr
+ if: ${{ github.event.pull_request != '' }}
+ with:
+ message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+ message: |
+ 📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            <p align="center">
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds" />
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds" />
+
+            <details>
+
+            <summary>Details</summary>
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio" />
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing" />
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[4] }}" alt="requests_deferred" />
+
+            </details>
+
+            </p>
+
+ - name: Upload results
+ if: ${{ github.event.pull_request }}
+ uses: edunad/actions-image@v2.0.0
+ with:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ path: 'examples/server/bench/*.png'
+ title: |
+ llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG}}tk/s
+ annotationLevel: 'success'
+
+ - uses: actions/upload-artifact@v4
+ with:
+ name: benchmark-results
+ compression-level: 9
+ path: |
+ examples/server/bench/**/.png
+ examples/server/bench/**/.json
+ examples/server/bench/**/.log
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
new file mode 100644
index 0000000000000..c0e08ae197227
--- /dev/null
+++ b/examples/server/bench/bench.py
@@ -0,0 +1,267 @@
+import argparse
+import base64
+import json
+import os
+import re
+import signal
+import socket
+import subprocess
+import sys
+import threading
+import time
+import traceback
+from contextlib import closing
+from datetime import datetime
+
+import matplotlib.pyplot as plt
+import requests
+
+
+def main(args_in: list[str] | None = None) -> None:
+ parser = argparse.ArgumentParser(description="Start server benchmark scenario")
+ parser.add_argument("--name", type=str, help="Bench name", required=True)
+ parser.add_argument("--runner-label", type=str, help="Runner label", required=True)
+ parser.add_argument("--branch", type=str, help="Branch name", default="detached")
+ parser.add_argument("--commit", type=str, help="Commit name", default="dirty")
+ parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0")
+ parser.add_argument("--port", type=int, help="Server listen host", default="8080")
+ parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models")
+ parser.add_argument("--n-prompts", type=int,
+ help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True)
+ parser.add_argument("--max-prompt-tokens", type=int,
+ help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset",
+ required=True)
+ parser.add_argument("--max-tokens", type=int,
+ help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens",
+ required=True)
+ parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True)
+ parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True)
+ parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True)
+ parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True)
+ parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True)
+ parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True)
+ parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True)
+ parser.add_argument("--scenario", type=str, help="Scenario to run", required=True)
+ parser.add_argument("--duration", type=str, help="Bench scenario", required=True)
+
+ args = parser.parse_args(args_in)
+
+ start_time = time.time()
+
+ # Start the server and performance scenario
+ try:
+ server_process = start_server(args)
+ except Exception:
+ print("bench: server start error :")
+ traceback.print_exc(file=sys.stdout)
+ sys.exit(1)
+
+ # start the benchmark
+ try:
+ start_benchmark(args)
+
+ iterations = 0
+ with open("results.github.env", 'w') as github_env:
+ # parse output
+ with open('k6-results.json', 'r') as bench_results:
+ # Load JSON data from file
+ data = json.load(bench_results)
+ for metric_name in data['metrics']:
+ for metric_metric in data['metrics'][metric_name]:
+ value = data['metrics'][metric_name][metric_metric]
+ if isinstance(value, float):
+ value = round(value, 2)
+                    data['metrics'][metric_name][metric_metric] = value
+ github_env.write(
+ f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
+ token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
+ iterations = data['root_group']['checks']['success completion']['passes']
+
+ except Exception:
+ print("bench: error :")
+ traceback.print_exc(file=sys.stdout)
+
+ # Stop the server
+ if server_process:
+ try:
+ print(f"bench: shutting down server pid={server_process.pid} ...")
+ if os.name == 'nt':
+ interrupt = signal.CTRL_C_EVENT
+ else:
+ interrupt = signal.SIGINT
+ server_process.send_signal(interrupt)
+ server_process.wait(0.5)
+
+ except subprocess.TimeoutExpired:
+ print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...")
+ server_process.kill() # SIGKILL
+ server_process.wait()
+
+ while is_server_listening(args.host, args.port):
+ time.sleep(0.1)
+
+ # Prometheus
+ end_time = time.time()
+ if is_server_listening("0.0.0.0", 9090):
+ metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds',
+ 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred']
+
+ for metric in metrics:
+ resp = requests.get(f"http://localhost:9090/api/v1/query_range",
+ params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
+ if resp.status_code != 200:
+ print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
+ else:
+ metric_data = resp.json()
+ values = metric_data['data']['result'][0]['values']
+ timestamps, metric_values = zip(*values)
+ metric_values = [float(value) for value in metric_values]
+ timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
+ plt.figure(figsize=(16, 10), dpi=80)
+ plt.plot(timestamps, metric_values, label=metric)
+ plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
+ plt.yticks(fontsize=12, alpha=.7)
+
+ plt.title(f"llama.cpp {args.name} on {args.runner_label}\n"
+ f"duration={args.duration} {iterations} iterations",
+ fontsize=14, wrap=True)
+ plt.grid(axis='both', alpha=.3)
+ plt.ylabel(f"llamacpp:{metric}", fontsize=22)
+ plt.xlabel(f"{args.hf_repo}/{args.hf_file}\n"
+ f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
+ f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
+ f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True)
+ plt.gcf().autofmt_xdate()
+
+ # Remove borders
+ plt.gca().spines["top"].set_alpha(0.0)
+ plt.gca().spines["bottom"].set_alpha(0.3)
+ plt.gca().spines["right"].set_alpha(0.0)
+ plt.gca().spines["left"].set_alpha(0.3)
+
+ # Save the plot as a PNG image
+ plt.savefig(f'{metric}.png')
+ plt.close()
+
+ # 140 chars max for commit status description
+ bench_results = {
+ "req": {
+ "p90": data['metrics']["http_req_duration"]["p(90)"],
+ "avg": data['metrics']["http_req_duration"]["avg"],
+ },
+ "pp": {
+ "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"],
+ "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"],
+ },
+ "tg": {
+ "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"],
+ "avg": data['metrics']["llamacpp_tokens_second"]["avg"],
+ },
+ }
+ with open("results.github.env", 'a') as github_env:
+ github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
+ github_env.write(f"BENCH_ITERATIONS={iterations}\n")
+
+
+def start_benchmark(args):
+ k6_path = 'k6'
+ if 'BENCH_K6_BIN_PATH' in os.environ:
+ k6_path = os.environ['BENCH_K6_BIN_PATH']
+ k6_args = [
+ 'run', args.scenario,
+ '--no-color',
+ ]
+ k6_args.extend(['--duration', args.duration])
+ k6_args.extend(['--iterations', args.n_prompts])
+ k6_args.extend(['--vus', args.parallel])
+ k6_args.extend(['--summary-export', 'k6-results.json'])
+ args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
+ args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
+ print(f"bench: starting k6 with: {args}")
+ k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr)
+ if k6_completed.returncode != 0:
+ raise Exception("bench: unable to run k6")
+
+
+def start_server(args):
+ server_process = start_server_background(args)
+
+ attempts = 0
+ max_attempts = 20
+ if 'GITHUB_ACTIONS' in os.environ:
+ max_attempts *= 2
+
+ while not is_server_listening(args.host, args.port):
+ attempts += 1
+ if attempts > max_attempts:
+ assert False, "server not started"
+ print(f"bench: waiting for server to start ...")
+ time.sleep(0.5)
+
+ print("bench: server started.")
+ return server_process
+
+
+def start_server_background(args):
+ # Start the server
+ server_path = '../../../build/bin/server'
+ if 'LLAMA_SERVER_BIN_PATH' in os.environ:
+ server_path = os.environ['LLAMA_SERVER_BIN_PATH']
+ server_args = [
+ '--host', args.host,
+ '--port', args.port,
+ ]
+ model_file = args.model_path_prefix + os.path.sep + args.hf_file
+ model_dir = os.path.dirname(model_file)
+ if not os.path.exists(model_dir):
+ os.makedirs(model_dir)
+ server_args.extend(['--model', model_file])
+ server_args.extend(['--hf-repo', args.hf_repo])
+ server_args.extend(['--hf-file', args.hf_file])
+ server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
+ server_args.extend(['--ctx-size', args.ctx_size])
+ server_args.extend(['--parallel', args.parallel])
+ server_args.extend(['--batch-size', args.batch_size])
+ server_args.extend(['--ubatch-size', args.ubatch_size])
+ server_args.extend(['--n-predict', args.max_tokens * 2])
+ server_args.extend(['--defrag-thold', "0.1"])
+ server_args.append('--cont-batching')
+ server_args.append('--metrics')
+ server_args.extend(['--log-format', "text"])
+ args = [str(arg) for arg in [server_path, *server_args]]
+ print(f"bench: starting server with: {' '.join(args)}")
+ pkwargs = {
+ 'stdout': subprocess.PIPE,
+ 'stderr': subprocess.PIPE
+ }
+ server_process = subprocess.Popen(
+ args,
+ **pkwargs)
+
+ def server_log(in_stream, out_stream):
+ for line in iter(in_stream.readline, b''):
+ print(line.decode('utf-8'), end='', file=out_stream)
+
+ thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout))
+ thread_stdout.start()
+ thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr))
+ thread_stderr.start()
+
+ return server_process
+
+
+def is_server_listening(server_fqdn, server_port):
+ with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+ result = sock.connect_ex((server_fqdn, server_port))
+ _is_server_listening = result == 0
+ if _is_server_listening:
+ print(f"server is listening on {server_fqdn}:{server_port}...")
+ return _is_server_listening
+
+
+def escape_metric_name(metric_name):
+ return re.sub('[^A-Z0-9]', '_', metric_name.upper())
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/server/bench/prometheus.yml b/examples/server/bench/prometheus.yml
new file mode 100644
index 0000000000000..b15ee52443fe8
--- /dev/null
+++ b/examples/server/bench/prometheus.yml
@@ -0,0 +1,9 @@
+global:
+ scrape_interval: 10s
+ external_labels:
+ llamacpp: 'server'
+
+scrape_configs:
+ - job_name: 'llama.cpp server'
+ static_configs:
+ - targets: ['localhost:8080']
diff --git a/examples/server/bench/requirements.txt b/examples/server/bench/requirements.txt
new file mode 100644
index 0000000000000..66ed226eda6f0
--- /dev/null
+++ b/examples/server/bench/requirements.txt
@@ -0,0 +1,2 @@
+matplotlib
+requests
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 86c3339dc7183..9a6cf7d6aae07 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1114,7 +1114,10 @@ def start_server_background(context):
server_args.append('--verbose')
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
server_args.extend(['--log-format', "text"])
- print(f"starting server with: {context.server_path} {server_args}")
+
+ args = [str(arg) for arg in [context.server_path, *server_args]]
+ print(f"bench: starting server with: {' '.join(args)}")
+
flags = 0
if 'nt' == os.name:
flags |= subprocess.DETACHED_PROCESS
@@ -1130,16 +1133,14 @@ def start_server_background(context):
[str(arg) for arg in [context.server_path, *server_args]],
**pkwargs)
- def log_stdout(process):
- for line in iter(process.stdout.readline, b''):
- print(line.decode('utf-8'), end='')
- thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+ def server_log(in_stream, out_stream):
+ for line in iter(in_stream.readline, b''):
+ print(line.decode('utf-8'), end='', file=out_stream)
+
+ thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
thread_stdout.start()
- def log_stderr(process):
- for line in iter(process.stderr.readline, b''):
- print(line.decode('utf-8'), end='', file=sys.stderr)
- thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+ thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
thread_stderr.start()
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
From 799317b27d0465fa66221a9f5b096baa5619fa27 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Mon, 25 Mar 2024 21:15:09 +0100
Subject: [PATCH 02/13] server: bench: reduce list of GPU nodes
---
.github/workflows/bench.yml | 4 ----
1 file changed, 4 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 788e7de358743..b0788cccf0dde 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -10,11 +10,7 @@ on:
type: choice
options:
- Standard_NC4as_T4_v3
- - Standard_NC64as_T4_v3
- Standard_NC24ads_A100_v4
- - Standard_NC48ads_A100_v4
- - Standard_ND96asr_A100_v4
- - Standard_NC40ads_H100_v5
- Standard_NC80adis_H100_v5
push:
branches:
From 5c0b2a2b597b14cfcde575188eeb64eb1cda9164 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Mon, 25 Mar 2024 21:44:45 +0100
Subject: [PATCH 03/13] server: bench: fix graph, fix output artifact
---
.github/workflows/bench.yml | 6 +++---
examples/server/bench/bench.py | 9 ++++++++-
2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b0788cccf0dde..49a1700a8316c 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -174,6 +174,6 @@ jobs:
name: benchmark-results
compression-level: 9
path: |
- examples/server/bench/**/.png
- examples/server/bench/**/.json
- examples/server/bench/**/.log
+ examples/server/bench/*.png
+ examples/server/bench/*.json
+ examples/server/bench/*.log
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index c0e08ae197227..3a213cce9c43f 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -1,5 +1,4 @@
import argparse
-import base64
import json
import os
import re
@@ -13,6 +12,8 @@
from contextlib import closing
from datetime import datetime
+import matplotlib
+import matplotlib.dates
import matplotlib.pyplot as plt
import requests
@@ -109,6 +110,10 @@ def main(args_in: list[str] | None = None) -> None:
for metric in metrics:
resp = requests.get(f"http://localhost:9090/api/v1/query_range",
params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2})
+
+ with open(f"{metric}.json", 'w') as metric_json:
+ metric_json.write(resp.text)
+
if resp.status_code != 200:
print(f"bench: unable to extract prometheus metric {metric}: {resp.text}")
else:
@@ -131,6 +136,8 @@ def main(args_in: list[str] | None = None) -> None:
f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True)
+ plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
+ plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S"))
plt.gcf().autofmt_xdate()
# Remove borders
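The graph fix above hinges on matplotlib's date machinery: Prometheus returns epoch seconds, which must be converted to datetime objects before MinuteLocator and DateFormatter can place ticks. A self-contained sketch with made-up sample values:

```python
from datetime import datetime

import matplotlib.dates
import matplotlib.pyplot as plt

# sample series: one point every 30s over five minutes (values are made up)
timestamps = [1711400000 + 30 * i for i in range(11)]
values = [float(i % 4) for i in range(11)]

plt.plot([datetime.fromtimestamp(ts) for ts in timestamps], values)
plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S"))
plt.gcf().autofmt_xdate()  # rotate tick labels so they stay readable
plt.savefig("sample.png")
plt.close()
```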
From 93434fdc7e98ee82ce837bb76f0c7053ddac8770 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 01:08:59 +0100
Subject: [PATCH 04/13] ci: bench: add mermaid fallback in case images cannot be uploaded
---
.github/workflows/bench.yml | 116 +++++++++++++++++++++++++--------
examples/server/bench/bench.py | 49 +++++++++++---
2 files changed, 129 insertions(+), 36 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 49a1700a8316c..65d89015873ec 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -117,6 +117,18 @@ jobs:
cat results.github.env >> $GITHUB_ENV
+ # Remove dataset as we do not want it in the artefact
+ rm ShareGPT_V3_unfiltered_cleaned_split.json
+
+ - uses: actions/upload-artifact@v4
+ with:
+ name: benchmark-results
+ compression-level: 9
+ path: |
+ examples/server/bench/*.png
+ examples/server/bench/*.json
+ examples/server/bench/*.log
+
- name: Commit status
uses: Sibz/github-status-action@v1
with:
@@ -128,6 +140,7 @@ jobs:
- name: Upload benchmark images
uses: devicons/public-upload-to-imgur@v2.2.2
+        continue-on-error: true # Important as the upload service looks unstable: 503
id: imgur_step
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
@@ -136,44 +149,95 @@ jobs:
examples/server/bench/predicted_tokens_seconds.png
examples/server/bench/kv_cache_usage_ratio.png
examples/server/bench/requests_processing.png
- examples/server/bench/requests_deferred.png
+
+ - name: Extract mermaid
+ id: set_mermaid
+ run: |
+ set -eux
+
+ cd examples/server/bench
+ PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
+ echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV
+ echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
+ echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV
+ echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
+ echo "KV_CACHE_USAGE_RATIO<> $GITHUB_ENV
+ echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
+
+ REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
+ echo "REQUESTS_PROCESSING<> $GITHUB_ENV
+ echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
+ echo "EOF" >> $GITHUB_ENV
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' }}
+ continue-on-error: true
with:
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            - ${{ env.BENCH_GRAPH_XLABEL }}
+            - req_avg=${{ env.HTTP_REQ_DURATION_AVG }} pp_avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }} tks_avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}
 
             <p align="center">
 
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds" />
-
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PROMPT_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.PREDICTED_TOKENS_SECONDS }}
+            ```
+
+            </details>
+
             <details>
 
             <summary>Details</summary>
 
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio" />
-
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing" />
-
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[4] }}" alt="requests_deferred" />
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.KV_CACHE_USAGE_RATIO }}
+            ```
+
+            </details>
+
+            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing" />
+
+            <details>
+
+            <summary>More</summary>
+
+            ```mermaid
+            ${{ env.REQUESTS_PROCESSING }}
+            ```
+
+            </details>
 
             </details>
 
             </p>
 
-      - name: Upload results
-        if: ${{ github.event.pull_request }}
-        uses: edunad/actions-image@v2.0.0
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          path: 'examples/server/bench/*.png'
-          title: |
-            llama.cpp server benchmark results for ${{ github.job }} on ${{ env.RUNNER_LABEL }}: ${{ env.LLAMACPP_TOKENS_SECOND_AVG}}tk/s
-          annotationLevel: 'success'
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: benchmark-results
-          compression-level: 9
-          path: |
-            examples/server/bench/*.png
-            examples/server/bench/*.json
-            examples/server/bench/*.log
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 3a213cce9c43f..cee9724319c69 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -101,6 +101,12 @@ def main(args_in: list[str] | None = None) -> None:
while is_server_listening(args.host, args.port):
time.sleep(0.1)
+ title = (f"llama.cpp {args.name} on {args.runner_label}\n "
+ f"duration={args.duration} {iterations} iterations")
+ xlabel = (f"{args.hf_repo}/{args.hf_file}\n"
+ f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
+ f"branch={args.branch} commit={args.commit}")
+
# Prometheus
end_time = time.time()
if is_server_listening("0.0.0.0", 9090):
@@ -121,23 +127,20 @@ def main(args_in: list[str] | None = None) -> None:
values = metric_data['data']['result'][0]['values']
timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values]
- timestamps = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
+ timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80)
- plt.plot(timestamps, metric_values, label=metric)
+ plt.plot(timestamps_dt, metric_values, label=metric)
plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
plt.yticks(fontsize=12, alpha=.7)
- plt.title(f"llama.cpp {args.name} on {args.runner_label}\n"
- f"duration={args.duration} {iterations} iterations",
+ ylabel = f"llamacpp:{metric}"
+ plt.title(title,
fontsize=14, wrap=True)
plt.grid(axis='both', alpha=.3)
- plt.ylabel(f"llamacpp:{metric}", fontsize=22)
- plt.xlabel(f"{args.hf_repo}/{args.hf_file}\n"
- f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size}\n"
- f"pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n"
- f"branch={args.branch} commit={args.commit}", fontsize=14, wrap=True)
+ plt.ylabel(ylabel, fontsize=22)
+ plt.xlabel(xlabel, fontsize=14, wrap=True)
plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator())
- plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y%m%d %H:%M:%S"))
+ plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S"))
plt.gcf().autofmt_xdate()
# Remove borders
@@ -150,6 +153,27 @@ def main(args_in: list[str] | None = None) -> None:
plt.savefig(f'{metric}.png')
plt.close()
+ # Mermaid format in case image failed
+            with open(f"{metric}.mermaid", 'w') as mermaid_f:
+ mermaid = (
+ f"""---
+config:
+ xyChart:
+ titleFontSize: 12
+ width: 900
+ height: 600
+ themeVariables:
+ xyChart:
+ titleColor: "#000000"
+---
+xychart-beta
+ title "{title}"
+ y-axis "llamacpp:{metric}"
+ x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
+ line [{', '.join([str(round(float(value))) for value in metric_values])}]
+ """)
+ mermaid_f.write(mermaid)
+
# 140 chars max for commit status description
bench_results = {
"req": {
@@ -169,6 +193,11 @@ def main(args_in: list[str] | None = None) -> None:
github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n")
github_env.write(f"BENCH_ITERATIONS={iterations}\n")
+ title = title.replace('\n', ' ')
+ xlabel = xlabel.replace('\n', ' ')
+ github_env.write(f"BENCH_GRAPH_TITLE={title}\n")
+ github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n")
+
def start_benchmark(args):
k6_path = 'k6'
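The Extract mermaid step in this patch leans on GitHub's multi-line `$GITHUB_ENV` convention: `NAME<<EOF`, the value, then a closing `EOF` line. The same convention can be written from Python; a sketch, assuming it runs inside a workflow step (`export_multiline` is an illustrative helper, not part of the patch):

```python
import os


def export_multiline(name: str, value: str) -> None:
    # GitHub Actions multi-line env syntax: NAME<<DELIM / value / DELIM.
    # The value must not itself contain a line equal to the delimiter.
    with open(os.environ["GITHUB_ENV"], "a") as env_file:
        env_file.write(f"{name}<<EOF\n{value}\nEOF\n")


with open("prompt_tokens_seconds.mermaid") as f:
    export_multiline("PROMPT_TOKENS_SECONDS", f.read())
```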
From 5c2f8e6bfb77dfdf59600b6b566b5f40b8ce4ddc Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 08:07:08 +0100
Subject: [PATCH 05/13] ci: bench: more resilient, more metrics
---
.github/workflows/bench.yml | 72 ++++++++++++++++++++++++----------
examples/server/bench/bench.py | 8 ++--
2 files changed, 56 insertions(+), 24 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 65d89015873ec..a7b4edcabc130 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -12,6 +12,15 @@ on:
- Standard_NC4as_T4_v3
- Standard_NC24ads_A100_v4
- Standard_NC80adis_H100_v5
+ sha:
+ description: 'Commit SHA1 to build'
+ required: false
+ type: string
+ duration:
+ description: 'Duration of the bench'
+ type: string
+ default: 10m
+
push:
branches:
- master
@@ -31,6 +40,7 @@ jobs:
runs-on: Standard_NC4as_T4_v3
env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Found no way to avoid duplicating the runs-on label
+ N_USERS: 8
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
@@ -38,6 +48,7 @@ jobs:
uses: actions/checkout@v3
with:
fetch-depth: 0
+ ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Install python env
id: pipenv
@@ -100,13 +111,13 @@ jobs:
--runner-label ${{ env.RUNNER_LABEL }} \
--name ${{ github.job }} \
--branch ${{ github.head_ref || github.ref_name }} \
- --commit ${{ github.sha }} \
+ --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
- --duration 10m \
+ --duration ${{ github.event.inputs.duration || "10m" }} \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
- --parallel 8 \
+ --parallel ${{ env.N_USERS }} \
-ngl 33 \
--batch-size 2048 \
--ubatch-size 256 \
@@ -125,7 +136,7 @@ jobs:
name: benchmark-results
compression-level: 9
path: |
- examples/server/bench/*.png
+ examples/server/bench/*.jpg
examples/server/bench/*.json
examples/server/bench/*.log
@@ -133,6 +144,7 @@ jobs:
uses: Sibz/github-status-action@v1
with:
authToken: ${{secrets.GITHUB_TOKEN}}
+ sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
context: bench-server-baseline
description: |
${{ env.BENCH_RESULTS }}
@@ -145,10 +157,10 @@ jobs:
with:
client_id: ${{secrets.IMGUR_CLIENT_ID}}
path: |
- examples/server/bench/prompt_tokens_seconds.png
- examples/server/bench/predicted_tokens_seconds.png
- examples/server/bench/kv_cache_usage_ratio.png
- examples/server/bench/requests_processing.png
+ examples/server/bench/prompt_tokens_seconds.jpg
+ examples/server/bench/predicted_tokens_seconds.jpg
+ examples/server/bench/kv_cache_usage_ratio.jpg
+ examples/server/bench/requests_processing.jpg
- name: Extract mermaid
id: set_mermaid
@@ -176,24 +188,40 @@ jobs:
echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
echo "EOF" >> $GITHUB_ENV
+ - name: Extract image url
+ id: extrac_image_url
+ continue-on-error: true
+ run: |
+ set -eux
+
+ echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
+ echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
+ echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
+ echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV
+
- name: Comment PR
uses: mshick/add-pr-comment@v2
id: comment_pr
if: ${{ github.event.pull_request != '' }}
- continue-on-error: true
with:
message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
message: |
- 📈 **llama.cpp server** benchmark for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+ 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+ - Concurrent users: ${{ env.N_USERS }}
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs
+ - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+ - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+ - Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- ${{ env.BENCH_GRAPH_XLABEL }}
- - req_avg=${{ env.HTTP_REQ_DURATION_AVG }} pp_avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }} tks_avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}
 
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" alt="prompt_tokens_seconds" />
+            <img width="100%" height="100%" src="${{ env.IMAGE_0 }}" alt="prompt_tokens_seconds" />
 
             <details>
 
             <summary>More</summary>
 
             ```mermaid
@@ -202,7 +230,7 @@ jobs:
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" alt="predicted_tokens_seconds" />
+            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds" />
 
             <summary>More</summary>
@@ -214,10 +242,14 @@ jobs:
+
             <details>
 
-            <summary>Details</summary>
-
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" alt="kv_cache_usage_ratio" />
+            <summary>Details</summary>
+
+            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
+
+            <details>
 
             <summary>More</summary>
@@ -228,7 +260,7 @@ jobs:
-            <img width="100%" height="100%" src="${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" alt="requests_processing" />
+            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing" />
 
             <summary>More</summary>
@@ -238,6 +270,6 @@ jobs:
```
-
+
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index cee9724319c69..df8c1398797f8 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -70,7 +70,7 @@ def main(args_in: list[str] | None = None) -> None:
for metric_name in data['metrics']:
for metric_metric in data['metrics'][metric_name]:
value = data['metrics'][metric_name][metric_metric]
- if isinstance(value, float):
+ if isinstance(value, float) or isinstance(value, int):
value = round(value, 2)
                    data['metrics'][metric_name][metric_metric] = value
github_env.write(
@@ -149,11 +149,11 @@ def main(args_in: list[str] | None = None) -> None:
plt.gca().spines["right"].set_alpha(0.0)
plt.gca().spines["left"].set_alpha(0.3)
- # Save the plot as a PNG image
- plt.savefig(f'{metric}.png')
+ # Save the plot as a jpg image
+ plt.savefig(f'{metric}.jpg', dpi=60)
plt.close()
- # Mermaid format in case image failed
+ # Mermaid format in case images upload failed
with (open(f"{metric}.mermaid", 'w') as mermaid_f):
mermaid = (
f"""---
From 225f63baccc91cbb20c6a498297be8c47249d6fb Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 08:10:39 +0100
Subject: [PATCH 06/13] ci: bench: trigger build
---
.github/workflows/bench.yml | 1 -
1 file changed, 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index a7b4edcabc130..b05312f37107a 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -215,7 +215,6 @@ jobs:
- Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- ${{ env.BENCH_GRAPH_XLABEL }}
-
From fb3b2f5eb10188c29db46697004a862f7aaca345 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 08:13:32 +0100
Subject: [PATCH 07/13] ci: bench: fix duration
---
.github/workflows/bench.yml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b05312f37107a..90975d8fcfa87 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -41,6 +41,7 @@ jobs:
env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Found no way to avoid duplicating the runs-on label
N_USERS: 8
+ DURATION: 10m
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.event.push.ref == 'refs/heads/master' }}
steps:
- name: Clone
@@ -113,7 +114,7 @@ jobs:
--branch ${{ github.head_ref || github.ref_name }} \
--commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
--scenario script.js \
- --duration ${{ github.event.inputs.duration || "10m" }} \
+ --duration ${{ github.event.inputs.duration || env.DURATION }} \
--hf-repo ggml-org/models \
--hf-file phi-2/ggml-model-q4_0.gguf \
--model-path-prefix /models \
From bff4644f4971c6f948a867f94bd537271c980361 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 08:20:28 +0100
Subject: [PATCH 08/13] ci: bench: fix typo
---
.github/workflows/bench.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 90975d8fcfa87..ddd6976c88dd9 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -218,7 +218,7 @@ jobs:
-
+
From 337c13b22688f627775c3892613926c54d2da1d1 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 26 Mar 2024 08:39:30 +0100
Subject: [PATCH 09/13] ci: bench: fix mermaid values and generated markdown
---
.github/workflows/bench.yml | 2 +-
examples/server/bench/bench.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index ddd6976c88dd9..4b44b61d123c6 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -222,7 +222,7 @@ jobs:
- More
+ More
```mermaid
${{ env.PROMPT_TOKENS_SECONDS }}
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index df8c1398797f8..ea5d3854d561a 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -170,7 +170,7 @@ def main(args_in: list[str] | None = None) -> None:
title "{title}"
y-axis "llamacpp:{metric}"
x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}
- line [{', '.join([str(round(float(value))) for value in metric_values])}]
+ line [{', '.join([str(round(float(value), 2)) for value in metric_values])}]
""")
mermaid_f.write(mermaid)
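The one-line change matters because some series, e.g. `kv_cache_usage_ratio`, live in [0, 1]: rounding to integers flattened them to 0s and 1s. A toy rendering of the xychart-beta template with two-decimal rounding (sample values are made up):

```python
metric = "kv_cache_usage_ratio"
timestamps = [1711400000, 1711400002, 1711400004]
metric_values = [0.125, 0.506, 0.875]

# round each sample to 2 decimals, as this patch does
line = ', '.join(str(round(float(v), 2)) for v in metric_values)
print("xychart-beta\n"
      f'    title "llama.cpp bench"\n'
      f'    y-axis "llamacpp:{metric}"\n'
      f'    x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))}\n'
      f"    line [{line}]")
# -> line [0.12, 0.51, 0.88]
```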
From 1c1f8769947ef6e483809beec87b59051cf3e435 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Tue, 26 Mar 2024 11:11:32 +0100
Subject: [PATCH 10/13] typo on the step name
Co-authored-by: Xuan Son Nguyen
---
.github/workflows/bench.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 4b44b61d123c6..a343488ba6a3d 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -190,7 +190,7 @@ jobs:
echo "EOF" >> $GITHUB_ENV
- name: Extract image url
- id: extrac_image_url
+ id: extract_image_url
continue-on-error: true
run: |
set -eux
From 30195d73079241bcb20b239ac961dec9218ce902 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Wed, 27 Mar 2024 18:30:28 +0100
Subject: [PATCH 11/13] ci: bench: trailing spaces
---
.github/workflows/bench.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index a343488ba6a3d..72e74a0e0bb9e 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -217,7 +217,7 @@ jobs:
- ${{ env.BENCH_GRAPH_XLABEL }}
-
+
From fce86c3a55fbcd59e50f119a08ab7106180698e1 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Wed, 27 Mar 2024 19:23:13 +0100
Subject: [PATCH 12/13] ci: bench: move images into a details section
---
.github/workflows/bench.yml | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 72e74a0e0bb9e..15857a9721fd4 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -209,13 +209,16 @@ jobs:
message: |
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
- - Concurrent users: ${{ env.N_USERS }}
- - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs
+ - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs Finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}reqs
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- - Finish reason : stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- ${{ env.BENCH_GRAPH_XLABEL }}
+
+            <details>
+
+            <summary>Time series</summary>
@@ -273,3 +276,4 @@ jobs:
+            </details>
From 4a6bfa92c5cfa12efa264c4c145dd91e6c8aba60 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Wed, 27 Mar 2024 19:55:13 +0100
Subject: [PATCH 13/13] ci: bench: reduce bullet point size
---
.github/workflows/bench.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 15857a9721fd4..c758b5c481730 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -210,7 +210,7 @@ jobs:
📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
- Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
- - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms passes=${{ env.HTTP_REQ_FAILED_FAILS }}reqs fails=${{ env.HTTP_REQ_FAILED_PASSES }}reqs Finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }}reqs truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}reqs
+ - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
- Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
- ${{ env.BENCH_GRAPH_XLABEL }}