This repository has been archived by the owner on Oct 11, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add .github/data/nm_flakiness_test.txt
- Loading branch information
07173d3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9838806153127505
prompts/s0.9839423464815629
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1004790270155
tokens/s290.1186806679104
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.38046962141033
tokens/s212.40363433497495
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8015637157399438
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.4492137483306
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.2966718603175
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6594143678990263
prompts/s1.6687833229306541
prompts/s1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
541.1969768416074
tokens/s544.2525428517424
tokens/s1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.18046071846766
tokens/s379.32334947986
tokens/s1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6516803459834803
prompts/s1.6687833229306541
prompts/s1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
538.6746235581297
tokens/s544.2525428517424
tokens/s1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
375.4269426420451
tokens/s379.32334947986
tokens/s1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8008328806036298
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.2352449529666
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.12698266058067
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9678529818865674
prompts/s0.9681718741664802
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.5911759623304
tokens/s288.6862621993805
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
209.9982876565348
tokens/s210.0642515379012
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4279516931534055
prompts/s3.426415053428332
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1026.636109931949
tokens/s1026.1759022129
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.0180193770127
tokens/s778.6722372435578
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46617228586627996
prompts/s0.46619889417344884
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.05920172872032
tokens/s126.06639696975621
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.11873379279801
tokens/s118.12547580566847
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839408532562771
prompts/s0.9839423464815629
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1182403854575
tokens/s290.1186806679104
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.3901927810558
tokens/s212.40363433497495
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.968213581943855
prompts/s0.9681718741664802
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6986984854122
tokens/s288.6862621993805
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.0765282529647
tokens/s210.0642515379012
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6534161903205102
prompts/s1.6687833229306541
prompts/s1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
539.2407472012903
tokens/s544.2525428517424
tokens/s1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
375.821500059852
tokens/s379.32334947986
tokens/s1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9682043997708495
prompts/s0.9681718741664802
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.69596057567264
tokens/s288.6862621993805
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.07453596628045
tokens/s210.0642515379012
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2159685723525158
prompts/s3.2103391227966993
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1019.1082808927887
tokens/s1017.3243646230461
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
735.078224540711
tokens/s733.795774201969
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.49188692147912894
prompts/s0.49192736369027046
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02538882379292
tokens/s130.03607931788608
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.98795652682396
tokens/s122.08653312065132
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4919031273254898
prompts/s0.49192736369027046
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02967267722
tokens/s130.03607931788608
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.9886962225393
tokens/s122.08653312065132
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7912902288425876
prompts/s1.790315599910969
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
569.1478386036413
tokens/s568.8381691034455
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.10093126848864
tokens/s400.8767272384647
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4297023346094284
prompts/s3.426415053428332
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1027.1604089580662
tokens/s1026.1759022129
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.428436392781
tokens/s778.6722372435578
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2161197334584184
prompts/s2.218472219108833
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.3968702181851
tokens/s685.1233805199839
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.6733494864889
tokens/s508.9323168783604
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.794475323821354
prompts/s1.790315599910969
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
570.1598409546414
tokens/s568.8381691034455
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.8105402918997
tokens/s400.8767272384647
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.794293224503511
prompts/s1.790315599910969
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
570.1019824169836
tokens/s568.8381691034455
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.76976546244515
tokens/s400.8767272384647
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2084901247639728
prompts/s3.2103391227966993
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1016.7384356364554
tokens/s1017.3243646230461
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
733.3859777514664
tokens/s733.795774201969
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6644829715829466
prompts/s1.6687833229306541
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
542.8500377308046
tokens/s544.2525428517424
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
378.3281021982886
tokens/s379.32334947986
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.21639859119691
prompts/s3.2103391227966993
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1019.2445495643888
tokens/s1017.3243646230461
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
735.1915243814249
tokens/s733.795774201969
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46619125237047376
prompts/s0.46619889417344884
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.06433052434105
tokens/s126.06639696975621
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.12353952563063
tokens/s118.12547580566847
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8025126935434426
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.7270488267943
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.526623988981
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4661784884746152
prompts/s0.46619889417344884
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.0608789967156
tokens/s126.06639696975621
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.12030540969799
tokens/s118.12547580566847
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.430342876819414
prompts/s3.426415053428332
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1027.3522447310206
tokens/s1026.1759022129
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.5865828717336
tokens/s778.6722372435578
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9681612165247417
prompts/s0.9681718741664802
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6830843392924
tokens/s288.6862621993805
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.06516635342828
tokens/s210.0642515379012
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8012619661869493
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.3608696264517
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.22661003977225
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.212043981727541
prompts/s3.2103391227966993
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1017.8646173696405
tokens/s1017.3243646230461
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
734.2047306179724
tokens/s733.795774201969
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.42941556360955
prompts/s3.426415053428332
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1027.0745240069027
tokens/s1026.1759022129
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.3541201851306
tokens/s778.6722372435578
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.210889630556952
prompts/s3.2103391227966993
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1017.4988150271924
tokens/s1017.3243646230461
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
733.9344487872728
tokens/s733.795774201969
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839303700328755
prompts/s0.9839423464815629
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.11514937209347
tokens/s290.1186806679104
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40432874689697
tokens/s212.40363433497495
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.660967877099355
prompts/s1.6687833229306541
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
541.7036341895114
tokens/s544.2525428517424
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.53135459317497
tokens/s379.32334947986
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4919050109685955
prompts/s0.49192736369027046
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.03017059943855
tokens/s130.03607931788608
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.08426498892582
tokens/s122.08653312065132
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.491903683383068
prompts/s0.49192736369027046
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.0298196654802
tokens/s130.03607931788608
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.98555476322242
tokens/s122.08653312065132
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4661126558232265
prompts/s0.46619889417344884
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.04307697001141
tokens/s126.06639696975621
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.10362473248912
tokens/s118.12547580566847
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7918698051312607
prompts/s1.790315599910969
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
569.3319877642256
tokens/s568.8381691034455
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.22115180655084
tokens/s400.8767272384647
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8026872995140495
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.7781687406268
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.5671658657002
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2165391100304213
prompts/s2.218472219108833
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.526384886995
tokens/s685.1233805199839
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.57159956182
tokens/s508.9323168783604
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839298545065714
prompts/s0.9839423464815629
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1149973674443
tokens/s290.1186806679104
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40093769233357
tokens/s212.40363433497495
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2168408372800346
prompts/s2.218472219108833
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.6195663077355
tokens/s685.1233805199839
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.5373764964738
tokens/s508.9323168783604
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839310730942807
prompts/s0.9839423464815629
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.11535667209233
tokens/s290.1186806679104
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40776028934965
tokens/s212.40363433497495
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4661661192676988
prompts/s0.46619889417344884
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.05753419824265
tokens/s126.06639696975621
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.11717130004952
tokens/s118.12547580566847
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9681229801068986
prompts/s0.9681718741664802
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6716831316746
tokens/s288.6862621993805
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.05687007039413
tokens/s210.0642515379012
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2153689163965087
prompts/s2.218472219108833
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.1649978876791
tokens/s685.1233805199839
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.21449089702463
tokens/s508.9323168783604
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7954438761430778
prompts/s1.790315599910969
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
570.4675797295242
tokens/s568.8381691034455
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
402.0274140078693
tokens/s400.8767272384647
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2163880596773264
prompts/s2.218472219108833
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.4797365099498
tokens/s685.1233805199839
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.7408496660948
tokens/s508.9323168783604
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.49189028417263925
prompts/s0.49192736369027046
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02627771819544
tokens/s130.03607931788608
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.98551120625338
tokens/s122.08653312065132
tokens/s1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.427939292640791
prompts/s3.426415053428332
prompts/s1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1026.6323961065596
tokens/s1026.1759022129
tokens/s1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.0391968777005
tokens/s778.6722372435578
tokens/s1.00
This comment was automatically generated by workflow using github-action-benchmark.