diff --git a/README.md b/README.md index a7240ec..c958ec9 100644 --- a/README.md +++ b/README.md @@ -58,11 +58,19 @@ various NVBench features and usecases: - [Runtime and compile-time parameter sweeps](examples/axes.cu) - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu) -- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu) +- [Reporting simple item/sec and byte/sec throughput statistics](examples/throughput.cu) +- [Gathering and reporting CUPTI metrics](examples/auto_throughput.cu) - [Skipping benchmark configurations](examples/skip.cu) - [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu) - [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu) +### Example Output Samples + +Sample outputs for `--list`, `--markdown`, `--json`, and `--csv` are provided +for each example. These are located in the [examples/outputs](examples/outputs/) +directory. See the associated [README](examples/outputs/README.md) for more +information. 
+ ### Building Examples To build the examples: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 34c8763..2e01fbb 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(example_srcs # Metatarget for all examples: add_custom_target(nvbench.example.all) add_dependencies(nvbench.all nvbench.example.all) +set(examples) foreach(example_src IN LISTS example_srcs) get_filename_component(example_name "${example_src}" NAME_WLE) @@ -25,4 +26,7 @@ foreach(example_src IN LISTS example_srcs) ) add_dependencies(nvbench.example.all ${example_name}) + list(APPEND examples ${example_name}) endforeach() + +add_subdirectory(outputs) diff --git a/examples/outputs/CMakeLists.txt b/examples/outputs/CMakeLists.txt new file mode 100644 index 0000000..179114d --- /dev/null +++ b/examples/outputs/CMakeLists.txt @@ -0,0 +1,52 @@ +set(NVBench_EXAMPLE_OUTPUT_DEVICES "all" CACHE STRING + "--devices argument used when generating example outputs." +) +set(NVBench_EXAMPLE_OUTPUT_TIMEOUT "15.0" CACHE STRING + "--timeout argument used when generating example outputs." +) +mark_as_advanced( + NVBench_EXAMPLE_OUTPUT_DEVICES + NVBench_EXAMPLE_OUTPUT_TIMEOUT +) + +set_property(GLOBAL PROPERTY JOB_POOLS exclusive=1) + +add_custom_target(nvbench.regenerate_example_outputs) +add_custom_target(nvbench.example.all.list) + +set(results_dir "${CMAKE_CURRENT_BINARY_DIR}/results") + +foreach (example IN LISTS examples) + # Print --list + add_custom_target(${example}.list + COMMAND "$<TARGET_FILE:${example}>" + --md "${results_dir}/${example}.list.md" + --list + BYPRODUCTS "${results_dir}/${example}.list.md" + COMMENT "Generating ${example}.list.md..."
+ VERBATIM + ) + add_dependencies(${example}.list nvbench.example.all) + add_dependencies(nvbench.regenerate_example_outputs ${example}.list) + add_dependencies(nvbench.example.all.list ${example}.list) + + # Run and output all formats + add_custom_target(${example}.outputs + COMMAND "$<TARGET_FILE:${example}>" + --quiet + --devices ${NVBench_EXAMPLE_OUTPUT_DEVICES} + --timeout ${NVBench_EXAMPLE_OUTPUT_TIMEOUT} + --md "${results_dir}/${example}.md" + --csv "${results_dir}/${example}.csv" + --json "${results_dir}/${example}.json" + BYPRODUCTS + "${results_dir}/${example}.md" + "${results_dir}/${example}.csv" + "${results_dir}/${example}.json" + COMMENT "Generating ${example} outputs (json, csv, md)..." + JOB_POOL exclusive + VERBATIM + ) + add_dependencies(${example}.outputs nvbench.example.all) + add_dependencies(nvbench.regenerate_example_outputs ${example}.outputs) +endforeach() diff --git a/examples/outputs/README.md b/examples/outputs/README.md new file mode 100644 index 0000000..1256bd6 --- /dev/null +++ b/examples/outputs/README.md @@ -0,0 +1,32 @@ +# Example Outputs + +This directory contains sample outputs for each of the NVBench examples: + +- `${example}.list.md`: The output of invoking the example with `--list`. +- `${example}.md`: The output of invoking the example with `--md stdout`. +- `${example}.json`: The output of invoking the example with `--json stdout`. +- `${example}.csv`: The output of invoking the example with `--csv stdout`. + +These files are only meant to provide samples of NVBench output formats. The +results and measurements in these files are not intended to demonstrate peak +performance of the devices and algorithms measured. The results may +intentionally include errors (noisy results, timeouts, etc.) to show how these +are reported. + +# Generating + +The outputs are generated by building the `nvbench.regenerate_example_outputs` +target.
Ideally, CUPTI metrics should be enabled and the GPU clocks locked +to `base`: + +```bash +# Enable non-root users to collect CUPTI metrics: +sudo rmmod nvidia +sudo modprobe nvidia NVreg_RestrictProfilingToAdminUsers=0 +# Enable persistence mode and lock GPU clocks to base: +sudo <nvbench_executable> --pm 1 --lgc base + +ninja nvbench.regenerate_example_outputs +``` + +The files will be written to `<build_dir>/examples/outputs/results/`. diff --git a/examples/outputs/nvbench.example.auto_throughput.csv b/examples/outputs/nvbench.example.auto_throughput.csv new file mode 100644 index 0000000..aaed1be --- /dev/null +++ b/examples/outputs/nvbench.example.auto_throughput.csv @@ -0,0 +1,9 @@ +Benchmark,Device,Device Name,T,Stride,Skipped,Elements,HBWPeak,LoadEff,StoreEff,L1HitRate,L2HitRate,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),Batch GPU (sec),Batch +throughput_bench,0,Quadro GV100,1,1,No,33554432,0.6585438475423353,1,1,0,0.49999904648868265,1078,0.00047011594805194853,0.0014244727929907238,0.0004639086524829754,0.0015463138823336802,72329825754.28767,0.0004611097332451499,1134 +throughput_bench,0,Quadro GV100,1,4,No,33554432,0.6947096407739456,0.25,1,0,0.2000340992245478,452,0.0011125579800884962,0.0006852662282240376,0.0011062891657373588,0.0006886487256687341,30330616116.660107,0.001102912886702233,473 +throughput_bench,0,Quadro GV100,2,1,No,33554432,0.671727291393325,0.5069332663594991,1,0.2682065963745117,0.5437664402009483,548,0.0009194173795620432,0.0006406757855676574,0.0009131468997819581,0.0006974456101610215,36745929935.27345,0.0009104959699842665,576 +throughput_bench,0,Quadro GV100,2,4,No,33554432,0.5825672478320796,0.12669523778540573,1,0.20691384209526908,0.3812073432135215,175,0.0028746560114285722,0.002486847758258855,0.002868418363843645,0.0024933453791821615,11697886341.460135,0.002855257117229959,184 +throughput_bench,1,Quadro GP100,1,1,Yes,,,,,,,,,,,,,, +throughput_bench,1,Quadro GP100,1,4,Yes,,,,,,,,,,,,,, +throughput_bench,1,Quadro
GP100,2,1,Yes,,,,,,,,,,,,,, +throughput_bench,1,Quadro GP100,2,4,Yes,,,,,,,,,,,,,, diff --git a/examples/outputs/nvbench.example.auto_throughput.json b/examples/outputs/nvbench.example.auto_throughput.json new file mode 100644 index 0000000..7a71ae8 --- /dev/null +++ b/examples/outputs/nvbench.example.auto_throughput.json @@ -0,0 +1,1293 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "throughput_bench", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "T": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "1", + "description": "nvbench::enum_type<1, int>", + "is_active": true + }, + { + "input_string": "2", + "description": "nvbench::enum_type<2, int>", + "is_active": 
true + } + ] + }, + "Stride": { + "type": "int64", + "flags": "", + "values": [ + { + "input_string": "1", + "description": "", + "value": 1 + }, + { + "input_string": "4", + "description": "", + "value": 4 + } + ] + } + }, + "states": { + "Device=0 T=1 Stride=1": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "1" + }, + "Stride": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Peak Sustained Global Memory Throughput (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "HBWPeak" + }, + "description": { + "type": "string", + "value": "The utilization level of the device memory relative to the peak utilization." + }, + "value": { + "type": "float64", + "value": "0.6585438475423353" + } + }, + "Global Load Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "LoadEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "1" + } + }, + "Global Store Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "StoreEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage." 
+ }, + "value": { + "type": "float64", + "value": "1" + } + }, + "L1 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L1HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L1 cache." + }, + "value": { + "type": "float64", + "value": "0" + } + }, + "L2 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L2HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L2 cache." + }, + "value": { + "type": "float64", + "value": "0.49999904648868265" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1078" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00047011594805194853" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0014244727929907238" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0004639086524829754" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0015463138823336802" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "72329825754.28767" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004611097332451499" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1134" + } + } + }, + "is_skipped": false + }, + "Device=0 T=1 Stride=4": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "1" + }, + "Stride": { + "type": "int64", + "value": "4" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Peak Sustained Global Memory Throughput (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "HBWPeak" + }, + "description": { + "type": "string", + "value": "The utilization level of the device memory relative to the peak utilization." + }, + "value": { + "type": "float64", + "value": "0.6947096407739456" + } + }, + "Global Load Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "LoadEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "0.25" + } + }, + "Global Store Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "StoreEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "1" + } + }, + "L1 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L1HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L1 cache." 
+ }, + "value": { + "type": "float64", + "value": "0" + } + }, + "L2 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L2HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L2 cache." + }, + "value": { + "type": "float64", + "value": "0.2000340992245478" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "452" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011125579800884962" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006852662282240376" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0011062891657373588" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006886487256687341" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "30330616116.660107" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001102912886702233" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "473" + } + } + }, + "is_skipped": false + }, + "Device=0 T=2 Stride=1": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "2" + }, + "Stride": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Peak Sustained Global Memory Throughput (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "HBWPeak" + }, + "description": { + "type": "string", + "value": "The utilization level of the device memory relative to the peak utilization." + }, + "value": { + "type": "float64", + "value": "0.671727291393325" + } + }, + "Global Load Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "LoadEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "0.5069332663594991" + } + }, + "Global Store Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "StoreEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "1" + } + }, + "L1 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L1HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L1 cache." 
+ }, + "value": { + "type": "float64", + "value": "0.2682065963745117" + } + }, + "L2 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L2HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L2 cache." + }, + "value": { + "type": "float64", + "value": "0.5437664402009483" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "548" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0009194173795620432" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006406757855676574" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0009131468997819581" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006974456101610215" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "36745929935.27345" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009104959699842665" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "576" + } + } + }, + "is_skipped": false + }, + "Device=0 T=2 Stride=4": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "2" + }, + "Stride": { + "type": "int64", + "value": "4" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Peak Sustained Global Memory Throughput (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "HBWPeak" + }, + "description": { + "type": "string", + "value": "The utilization level of the device memory relative to the peak utilization." + }, + "value": { + "type": "float64", + "value": "0.5825672478320796" + } + }, + "Global Load Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "LoadEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "0.12669523778540573" + } + }, + "Global Store Efficiency (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "StoreEff" + }, + "description": { + "type": "string", + "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage." + }, + "value": { + "type": "float64", + "value": "1" + } + }, + "L1 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L1HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L1 cache." 
+ }, + "value": { + "type": "float64", + "value": "0.20691384209526908" + } + }, + "L2 Cache Hit Rate (HW)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "L2HitRate" + }, + "description": { + "type": "string", + "value": "Hit rate at L2 cache." + }, + "value": { + "type": "float64", + "value": "0.3812073432135215" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "175" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0028746560114285722" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.002486847758258855" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.002868418363843645" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0024933453791821615" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "11697886341.460135" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002855257117229959" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "184" + } + } + }, + "is_skipped": false + }, + "Device=1 T=1 Stride=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "1" + }, + "Stride": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + } + }, + "is_skipped": true, + "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)" + }, + "Device=1 T=1 Stride=4": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "1" + }, + "Stride": { + "type": "int64", + "value": "4" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + } + }, + "is_skipped": true, + "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)" + }, + "Device=1 T=2 Stride=1": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "2" + }, + "Stride": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + } + }, + "is_skipped": true, + "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)" + }, + "Device=1 T=2 Stride=4": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + 
"axis_values": { + "T": { + "type": "string", + "value": "2" + }, + "Stride": { + "type": "int64", + "value": "4" + } + }, + "summaries": { + "Element count: Elements": { + "short_name": { + "type": "string", + "value": "Elements" + }, + "value": { + "type": "int64", + "value": "33554432" + } + } + }, + "is_skipped": true, + "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)" + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.auto_throughput.list.md b/examples/outputs/nvbench.example.auto_throughput.list.md new file mode 100644 index 0000000..8f808da --- /dev/null +++ b/examples/outputs/nvbench.example.auto_throughput.list.md @@ -0,0 +1,41 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 31601 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15563 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `throughput_bench` (4 configurations) + +### Axes + +* `T` : type + * `1` (nvbench::enum_type<1, int>) + * `2` (nvbench::enum_type<2, int>) +* `Stride` : int64 + * `1` + * `4` + diff --git a/examples/outputs/nvbench.example.auto_throughput.md b/examples/outputs/nvbench.example.auto_throughput.md new file mode 100644 index 0000000..ded46d8 --- /dev/null +++ 
b/examples/outputs/nvbench.example.auto_throughput.md @@ -0,0 +1,73 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: throughput_bench [Device=0 T=1 Stride=1] +Pass: Cold: 0.463909ms GPU, 0.470116ms CPU, 0.50s total GPU, 1078x +Pass: Batch: 0.461110ms GPU, 0.52s total GPU, 1134x +Run: throughput_bench [Device=0 T=1 Stride=4] +Pass: Cold: 1.106289ms GPU, 1.112558ms CPU, 0.50s total GPU, 452x +Pass: Batch: 1.102913ms GPU, 0.52s total GPU, 473x +Run: throughput_bench [Device=0 T=2 Stride=1] +Pass: Cold: 0.913147ms GPU, 0.919417ms CPU, 0.50s total GPU, 548x +Pass: Batch: 0.910496ms GPU, 0.52s total GPU, 576x +Run: throughput_bench [Device=0 T=2 Stride=4] +Pass: Cold: 2.868418ms GPU, 2.874656ms CPU, 0.50s total GPU, 175x +Pass: Batch: 2.855257ms GPU, 0.53s total GPU, 184x +Run: throughput_bench [Device=1 T=1 Stride=1] +Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600) +Fail: Unexpected error: Device: 1 isn't supported (CC 600) +Run: throughput_bench [Device=1 T=1 Stride=4] +Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600) +Fail: Unexpected error: 
Device: 1 isn't supported (CC 600) +Run: throughput_bench [Device=1 T=2 Stride=1] +Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600) +Fail: Unexpected error: Device: 1 isn't supported (CC 600) +Run: throughput_bench [Device=1 T=2 Stride=4] +Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600) +Fail: Unexpected error: Device: 1 isn't supported (CC 600) +``` + +# Benchmark Results + +## throughput_bench + +### [0] Quadro GV100 + +| T | Stride | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | Batch GPU | Batch | +|---|--------|----------|---------|---------|----------|-----------|-----------|---------|------------|-------|------------|-------|---------|------------|-------| +| 1 | 1 | 33554432 | 65.85% | 100.00% | 100.00% | 0.00% | 50.00% | 1078x | 470.116 us | 0.14% | 463.909 us | 0.15% | 72.330G | 461.110 us | 1134x | +| 1 | 4 | 33554432 | 69.47% | 25.00% | 100.00% | 0.00% | 20.00% | 452x | 1.113 ms | 0.07% | 1.106 ms | 0.07% | 30.331G | 1.103 ms | 473x | +| 2 | 1 | 33554432 | 67.17% | 50.69% | 100.00% | 26.82% | 54.38% | 548x | 919.417 us | 0.06% | 913.147 us | 0.07% | 36.746G | 910.496 us | 576x | +| 2 | 4 | 33554432 | 58.26% | 12.67% | 100.00% | 20.69% | 38.12% | 175x | 2.875 ms | 0.25% | 2.868 ms | 0.25% | 11.698G | 2.855 ms | 184x | + +### [1] Quadro GP100 + +No data -- check log. 
diff --git a/examples/outputs/nvbench.example.axes.csv b/examples/outputs/nvbench.example.axes.csv new file mode 100644 index 0000000..177dc56 --- /dev/null +++ b/examples/outputs/nvbench.example.axes.csv @@ -0,0 +1,127 @@ +Benchmark,Device,Device Name,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,Duration,BlockSize (pow2),BlockSize,NumBlocks (pow2),NumBlocks,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak,T,In,Out,Items,InSize (bytes),OutSize (bytes) +simple,0,Quadro GV100,No,499,0.0010102518997995992,0.0005320863289715677,0.0010037636295826922,0.0005606955085401353,0.001001475909284053,524,,,,,,,,,,,,,, +simple,1,Quadro GP100,No,499,0.0010072372945891786,0.0004201092756117083,0.001002567436508759,0.0003010855735431417,0.001001474511532383,524,,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,147957,1.0618111072811744e-05,0.03254637275181478,4.42401244240246e-06,0.1078451537144948,2.0427748176574993e-06,244766,0,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,4831,0.00011004767191057754,0.004150807794866266,0.00010351461256001611,0.004781428015913556,0.00010137620362095862,5088,0.0001,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,2453,0.00021036899388503913,0.002247040450593631,0.0002039032309542721,0.0024717338672984117,0.00020172918129115027,2582,0.0002,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,1648,0.00030986622087378617,0.0014834014227566264,0.0003034121238635499,0.0016613620256970604,0.00030116395027406757,1736,0.00030000000000000003,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,1239,0.0004101481057304273,0.0012074050648415652,0.000403672849583562,0.00138227314754308,0.00040141034272550807,1304,0.0004,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,992,0.0005105290151209681,0.0009132925978805083,0.0005040890874881937,0.001126663137928465,0.0005017619947554283,1042,0.0005,,,,,,,,,,,,, +single_float64_axis,0,Quadro 
GV100,No,829,0.0006098617478890229,0.0007603582722561039,0.000603470634925204,0.0009715240544303389,0.0006011044563503441,872,0.0006000000000000001,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,711,0.0007102935302390999,0.0006584784515618755,0.0007037439938168366,0.0008111324463740949,0.0007014426981064088,748,0.0007000000000000001,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,622,0.000810564897106109,0.0006072346537084304,0.0008041868904587526,0.000707547745748677,0.0008017951065694391,653,0.0008000000000000001,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,554,0.0009098726931407932,0.0005164841732610924,0.000903432962588876,0.0005681374617078215,0.0009011252491744524,582,0.0009000000000000002,,,,,,,,,,,,, +single_float64_axis,0,Quadro GV100,No,499,0.0010102697054108218,0.00044759296847813034,0.0010038065348932821,0.0005354727313941588,0.0010014759304418617,523,0.0010000000000000002,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,152839,7.705229234684682e-06,0.05418085805591698,3.016308709558017e-06,0.04127500754098809,1.3434882326935044e-06,372166,0,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,4879,0.00010715639024390229,0.004066079001791793,0.0001024813676868744,0.00308685243303526,0.0001013762513461915,5107,0.0001,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,2466,0.00020754400243309012,0.0019319860083082698,0.00020283288218606642,0.0014786760562713035,0.00020172824844867798,2586,0.0002,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,1655,0.00030687972205438067,0.0013377397645028726,0.00030219129368978006,0.0010508732574554497,0.00030105650589762746,1736,0.00030000000000000003,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,1243,0.0004072136267095737,0.001056923897828408,0.0004025078938310397,0.0007582628414963917,0.0004014090936302682,1305,0.0004,,,,,,,,,,,,, +single_float64_axis,1,Quadro 
GP100,No,995,0.0005075617336683415,0.0007933683824162227,0.0005028640134849749,0.0006232695683416088,0.0005017614337245814,1045,0.0005,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,831,0.0006069544103489773,0.0007196728740232229,0.0006022227483966297,0.0004991286373051354,0.0006010893901462271,873,0.0006000000000000001,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,712,0.0007072545856741567,0.0005896830799502328,0.000702559235139509,0.0004401401059381315,0.0007014417189327791,748,0.0007000000000000001,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,623,0.0008076356067415732,0.0005473369257973612,0.0008029095356384018,0.00040823678026317246,0.0008017940608599713,655,0.0008000000000000001,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,555,0.0009069345225225232,0.0004925253150638496,0.0009022477837296215,0.0003476692854287226,0.000901123046875,582,0.0009000000000000002,,,,,,,,,,,,, +single_float64_axis,1,Quadro GP100,No,499,0.0010072963186372732,0.0004478227973886651,0.0010025936021116784,0.0003019453543048136,0.001001473230260019,524,0.0010000000000000002,,,,,,,,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,66,0.00762213684848485,0.001421350078274123,0.007615782766631154,0.001435096779193961,0.007614612524060236,69,,2^6,64,2^6,64,8811814367.137686,70494514937.10149,0.08099094087442726,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,206,0.0024424633495145616,0.004624145111239578,0.0024361206686612466,0.004621497870662145,0.0024339480377906972,215,,2^8,256,2^6,64,27547430167.685093,220379441341.48074,0.25319329198239976,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,13161,0.0011122250784894763,0.012946747378394284,0.0011059346540513448,0.013016477670773642,0.0011028085603954402,13162,,2^10,1024,2^6,64,60680677428.96169,485445419431.69354,0.5577268146044273,,,,,, +copy_sweep_grid_shape,0,Quadro 
GV100,No,375,0.0024504410826666676,0.005006459599172983,0.0024441843128204348,0.004997852033101851,0.002444396898467488,376,,2^6,64,2^8,256,27456548038.53994,219652384308.31952,0.25235797829540385,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,13509,0.0010829132057147126,0.00962532025494488,0.0010766412656412294,0.009687962765161975,0.0010755151240936572,13510,,2^8,256,2^8,256,62331684788.27634,498653478306.2107,0.5729015145981281,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,15105,0.0009647508075471688,0.00509700309681285,0.0009584777770750065,0.0051422186996564374,0.0009572492433860444,15106,,2^10,1024,2^8,256,70016087597.56184,560128700780.4948,0.6435302168893552,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,13582,0.0010768811630834938,0.007374891553423134,0.0010706156819352108,0.007408403333483488,0.0010709149855748453,13583,,2^6,64,2^10,1024,62682496746.82156,501459973974.5725,0.5761258892171099,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,1782,0.0009628989545454543,0.004952339328546463,0.0009565676676170035,0.004999266556092292,0.0009545994625289475,1783,,2^8,256,2^10,1024,70155898293.3024,561247186346.4192,0.6448152416663824,,,,,, +copy_sweep_grid_shape,0,Quadro GV100,No,14579,0.0010004705825502492,0.019254279361810064,0.000994218452561916,0.019368874235788265,0.0009928190472480985,14580,,2^10,1024,2^10,1024,67499113325.72127,539992906605.77014,0.6203962621849382,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,2236,0.006688950130143119,0.010949635482023172,0.006684225965911029,0.010951296261576433,0.006674802853478132,2237,,2^6,64,2^6,64,10039885596.66435,80319084773.3148,0.10970154716634999,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,218,0.002301079724770642,0.0028902989580355323,0.0022963436360752907,0.00289533405223059,0.0022982710453501917,228,,2^8,256,2^6,64,29224225392.80601,233793803142.4481,0.31932064458922654,,,,,, +copy_sweep_grid_shape,1,Quadro 
GP100,No,426,0.0011791361924882624,0.003945300386512142,0.0011743737087003502,0.003937393066379565,0.0011721584260596465,449,,2^10,1024,2^6,64,57144385558.72278,457155084469.7822,0.6243923247238066,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,226,0.00221838060176991,0.0015555610970668225,0.002213621245021314,0.00156938592170116,0.002213029949976925,237,,2^6,64,2^8,256,30316326314.149483,242530610513.19586,0.33125356549551443,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,12933,0.0011352358293512678,0.006664341285751961,0.0011305139959895455,0.006686678312757843,0.0011301243156632502,12934,,2^8,256,2^8,256,59361373886.62687,474890991093.01495,0.6486164104745069,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,447,0.001123642310961969,0.0021689290927995966,0.001118954880392258,0.0021631706036555243,0.001117002699110243,468,,2^10,1024,2^8,256,59974593413.87786,479796747311.0229,0.6553167986656235,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,448,0.0011216752544642855,0.003016436766525619,0.0011169237145887954,0.0030169455335936853,0.0011148893315741358,470,,2^6,64,2^10,1024,60083659361.37964,480669274891.0371,0.6565085157493404,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,448,0.0011223883750000004,0.002772433370845318,0.001117700926693421,0.002767310086041235,0.001115604862286027,471,,2^8,256,2^10,1024,60041879180.089096,480335033440.71277,0.6560520015306938,,,,,, +copy_sweep_grid_shape,1,Quadro GP100,No,474,0.001060387455696202,0.0015438063319203115,0.0010557063963845812,0.0015281670861921415,0.0010540968806868097,498,,2^10,1024,2^10,1024,63567734580.20524,508541876641.6419,0.6945775194515432,,,,,, +copy_type_sweep,0,Quadro GV100,No,197,0.0025498305939086305,0.0029889374958217657,0.002543548455698237,0.0030039924801443835,0.002539370934940079,206,,,,,,105535813716.71214,211071627433.4243,0.24249957195935695,U8,,,,, +copy_type_sweep,0,Quadro 
GV100,No,314,0.001601867525477707,0.0041088290293520464,0.001595620784789893,0.004121783467130656,0.0015915001814459024,331,,,,,,84116307132.25725,336465228529.029,0.3865639114533881,U16,,,,, +copy_type_sweep,0,Quadro GV100,No,13509,0.0010828757243319257,0.009631338263504865,0.0010766108759909336,0.00968747141038187,0.0010754745522399284,13510,,,,,,62333444233.72251,498667553869.7801,0.5729176859717142,U32,,,,, +copy_type_sweep,0,Quadro GV100,No,15542,0.000936680335671086,0.006014811602456382,0.0009304157742707227,0.006054554605786883,0.0009291894323484043,15543,,,,,,36063911347.91389,577022581566.6222,0.6629395468366523,U64,,,,, +copy_type_sweep,0,Quadro GV100,No,13508,0.0010830444422564424,0.009669058622361396,0.0010767860333789413,0.009724931080690958,0.0010753850626975213,13509,,,,,,62323304648.94053,498586437191.52423,0.5728244912586445,F32,,,,, +copy_type_sweep,0,Quadro GV100,No,15546,0.0009366829067284162,0.005915735845058088,0.000930411673893452,0.005957150780596753,0.000929181897058261,15547,,,,,,36064070283.626465,577025124538.0234,0.6629424684490159,F64,,,,, +copy_type_sweep,1,Quadro GP100,No,5497,0.0027078292635983312,0.006395900728715414,0.002703092247003721,0.006391943780941085,0.0026997315984675927,5498,,,,,,99306805491.95126,198613610983.90253,0.2712707754915627,U8,,,,, +copy_type_sweep,1,Quadro GP100,No,330,0.001520048475757576,0.004360351977281759,0.0015153354655612601,0.004371610333527513,0.0015136887122844827,348,,,,,,88572947080.2609,354291788321.0436,0.48389940494023653,U16,,,,, +copy_type_sweep,1,Quadro GP100,No,12935,0.001135307039969079,0.006620691359921369,0.0011306220265028327,0.006643085804704841,0.0011301225775930074,12936,,,,,,59355701929.47401,474845615435.79205,0.6485544354182038,U32,,,,, +copy_type_sweep,1,Quadro GP100,No,478,0.0010522014937238498,0.002705184693469705,0.0010475129038718955,0.0026904354098530054,0.0010449056396484376,500,,,,,,32032476044.899876,512519616718.398,0.7000104030791057,U64,,,,, +copy_type_sweep,1,Quadro 
GP100,No,12933,0.0011353411816283944,0.0066894954953519185,0.0011306308270201252,0.006710880496820158,0.001130335244040081,12934,,,,,,59355239921.125435,474841919369.0035,0.6485493872500594,F32,,,,, +copy_type_sweep,1,Quadro GP100,No,477,0.0010531248867924527,0.0028403001389236257,0.0010484168726943076,0.0028275833565767315,0.001045540454641914,497,,,,,,32004856917.047768,512077710672.7643,0.6994068382221977,F64,,,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I8,I8,,, +copy_type_conversion_sweep,0,Quadro GV100,No,712,0.0007091774676966297,0.002907153358900259,0.0007029326261764157,0.0029793074635008562,0.0006997137222698028,748,,,,,,95469838076.85379,286409514230.56134,0.3290550485185677,,I8,I16,67108864,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,No,622,0.0008109339212218649,0.0030780801739523534,0.0008046977720268298,0.003128636670235119,0.0008020894928445145,654,,,,,,83396358648.99908,416981793244.9954,0.4790691558421363,,I8,I32,67108864,67108864,268435456 +copy_type_conversion_sweep,0,Quadro GV100,No,614,0.0008210284218241037,0.0032226581332443155,0.0008147683657147577,0.0032802944549392547,0.0008120876105256783,645,,,,,,82365573853.77692,411827869268.8846,0.47314782774458247,,I8,F32,67108864,67108864,268435456 +copy_type_conversion_sweep,0,Quadro GV100,No,12047,0.001218285543537808,0.010379223139149431,0.0012120340953301184,0.010435925751575564,0.0012109437788209631,12048,,,,,,55368792229.992294,498319130069.93066,0.5725173828928432,,I8,I64,67108864,67108864,536870912 +copy_type_conversion_sweep,0,Quadro GV100,No,12345,0.0011880294179019875,0.007390662960171094,0.0011817586330519084,0.007441978357839151,0.00118048292704533,12346,,,,,,56787284749.24732,511085562743.2259,0.5871846998428606,,I8,F64,67108864,67108864,536870912 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I16,I8,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I16,I16,,, +copy_type_conversion_sweep,0,Quadro 
GV100,No,30679,0.00045337666328758926,0.005585454361874255,0.0004471040883478709,0.005691409845285877,0.00044639401537845537,30680,,,,,,75048367649.66653,450290205897.9992,0.5173370931732527,,I16,I32,33554432,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,No,1111,0.00045644497029703035,0.004630719301683377,0.000450117155493756,0.004699806594964763,0.0004474967917148505,1162,,,,,,74545996726.5466,447275980359.2796,0.513874058317187,,I16,F32,33554432,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,No,21586,0.0006636389210136234,0.00678406319186359,0.0006573809432517179,0.006858260008522721,0.0006561174402032673,21587,,,,,,51042599187.65498,510425991876.5498,0.5864269208140508,,I16,I64,33554432,67108864,268435456 +copy_type_conversion_sweep,0,Quadro GV100,No,21638,0.0006617334348830768,0.006935544837856402,0.0006554566019343122,0.0070075276540148375,0.000653902132173752,21639,,,,,,51192454086.1711,511924540861.71094,0.5881485993356054,,I16,F64,33554432,67108864,268435456 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I8,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I16,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I32,,, +copy_type_conversion_sweep,0,Quadro GV100,No,47462,0.00027253003156209153,0.012292429580379986,0.0002662904619916644,0.01259432062607704,0.00026489117725947637,47463,,,,,,63003443212.04103,504027545696.32825,0.5790757648165535,,I32,F32,16777216,67108864,67108864 +copy_type_conversion_sweep,0,Quadro GV100,No,35529,0.0003843450104703185,0.007061698560844491,0.0003780983807571241,0.007193170430510475,0.0003773213782537234,35530,,,,,,44372620603.146774,532471447237.7613,0.6117548796389721,,I32,I64,16777216,67108864,134217728 +copy_type_conversion_sweep,0,Quadro 
GV100,No,35498,0.0003847829341653037,0.007946267454909624,0.0003785711211350411,0.008088117160474145,0.00037782656535605304,35499,,,,,,44317210329.45711,531806523953.4853,0.6109909512333241,,I32,F64,16777216,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,I8,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,I16,,, +copy_type_conversion_sweep,0,Quadro GV100,No,47607,0.00027168871905392146,0.01320649583700062,0.0002654402574849676,0.013542194342127112,0.00026387154338796694,47608,,,,,,63205243089.21048,505641944713.68384,0.580930543099361,,F32,I32,16777216,67108864,67108864 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,F32,,, +copy_type_conversion_sweep,0,Quadro GV100,No,35499,0.00038478567168089093,0.007793965803915478,0.00037854311241399495,0.007936841129938173,0.0003779212159950015,35500,,,,,,44320489396.863045,531845872762.35657,0.6110361589641045,,F32,I64,16777216,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,No,35509,0.00038455552378270424,0.008192214819970133,0.0003782884006425947,0.008364103785135068,0.0003775098086048749,35510,,,,,,44350331576.38646,532203978916.6376,0.6114475860715045,,F32,F64,16777216,67108864,134217728 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I8,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I16,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I32,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,F32,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I64,,, +copy_type_conversion_sweep,0,Quadro GV100,No,52100,0.00024245567445297397,0.008539487651977316,0.0002361851441692315,0.008809731550468568,0.00023509478066781645,52101,,,,,,35517085672.37146,568273370757.9434,0.6528876042715341,,I64,F64,8388608,67108864,67108864 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,I8,,, +copy_type_conversion_sweep,0,Quadro 
GV100,Yes,,,,,,,,,,,,,,,,,F64,I16,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,I32,,, +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,F32,,, +copy_type_conversion_sweep,0,Quadro GV100,No,51780,0.0002443157005407483,0.009752081536074006,0.00023802995230194252,0.010060889608084888,0.00023685153101656335,51781,,,,,,35241816917.89358,563869070686.2972,0.6478275168730437,,F64,I64,8388608,67108864,67108864 +copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,F64,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I8,I8,,, +copy_type_conversion_sweep,1,Quadro GP100,No,21576,0.0006664836124397485,0.008032774520430731,0.0006617529590178744,0.008026552131843242,0.000660956536269952,21577,,,,,,101410750168.15656,304232250504.46967,0.415527002983596,,I8,I16,67108864,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,No,16864,0.0008629180932163245,0.007882137751662148,0.0008582004840137586,0.00791837780480227,0.0008575679956081893,16865,,,,,,78197187312.3811,390985936561.9055,0.534017068075155,,I8,I32,67108864,67108864,268435456 +copy_type_conversion_sweep,1,Quadro GP100,No,16866,0.0008626138966559889,0.007971447505119814,0.0008578826743068147,0.008007931525887958,0.0008570867944797,16867,,,,,,78226156104.88371,391130780524.4186,0.5342148990991294,,I8,F32,67108864,67108864,268435456 +copy_type_conversion_sweep,1,Quadro GP100,No,10113,0.0014600333443093004,0.005526790210274132,0.0014553281934728316,0.0055395074611650314,0.0014537668022591766,10114,,,,,,46112529325.676674,415012763931.09,0.5668334297572799,,I8,I64,67108864,67108864,536870912 +copy_type_conversion_sweep,1,Quadro GP100,No,10100,0.0014618894557425724,0.005445897883226132,0.001457197949886321,0.005462563281467926,0.0014559329674746807,10101,,,,,,46053361525.27205,414480253727.4484,0.5661061157772187,,I8,F64,67108864,67108864,536870912 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I16,I8,,, 
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I16,I16,,, +copy_type_conversion_sweep,1,Quadro GP100,No,30414,0.00046039200555665144,0.00783107553466636,0.00045566900502105564,0.00787891812842585,0.0004554145292263416,30415,,,,,,73637731841.00926,441826391046.0556,0.6034560629453338,,I16,I32,33554432,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,No,30506,0.00045893235589720245,0.007769133803136496,0.00045424179060767235,0.007825865470567534,0.00045380600078266583,30507,,,,,,73869099439.55573,443214596637.33435,0.6053521042358697,,I16,F32,33554432,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,No,19198,0.0007536445464110835,0.0055724274957070705,0.0007489312136715944,0.005596667585138299,0.0007480235320061677,19199,,,,,,44803089238.998634,448030892389.9864,0.6119303053840505,,I16,I64,33554432,67108864,268435456 +copy_type_conversion_sweep,1,Quadro GP100,No,19239,0.0007522073903529291,0.005390014038421145,0.0007475173373753664,0.00541460189354359,0.0007464590773358217,19240,,,,,,44887831120.83274,448878311208.3274,0.6130877283767584,,I16,F64,33554432,67108864,268435456 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I8,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I16,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I32,,, +copy_type_conversion_sweep,1,Quadro GP100,No,47007,0.00027858482081392173,0.007845561244706417,0.00027388008982136936,0.007927787169621173,0.0002735391274529489,47008,,,,,,61257523359.73918,490060186877.91345,0.669334826920227,,I32,F32,16777216,67108864,67108864 +copy_type_conversion_sweep,1,Quadro GP100,No,1196,0.00042285716053511705,0.0044259706705914136,0.000418105284879638,0.004434430384125105,0.00041619211102596686,1254,,,,,,40126773343.29734,481521280119.5681,0.6576722029605115,,I32,I64,16777216,67108864,134217728 +copy_type_conversion_sweep,1,Quadro 
GP100,No,1195,0.0004233829054393305,0.004675323526829325,0.0004187026210409825,0.004672495644368017,0.000416603259004343,1252,,,,,,40069527050.69847,480834324608.3817,0.6567339442312906,,I32,F64,16777216,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,I8,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,I16,,, +copy_type_conversion_sweep,1,Quadro GP100,No,46545,0.000281580676807392,0.012343793517939674,0.00027684729846364837,0.012502080899567246,0.00027647899042774625,46546,,,,,,60600974230.5755,484807793844.604,0.6621609946522673,,F32,I32,16777216,67108864,67108864 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,F32,,, +copy_type_conversion_sweep,1,Quadro GP100,No,1196,0.00042307038377926434,0.004605601751545893,0.00041839050190105916,0.004593649085141324,0.00041637280448239853,1257,,,,,,40099418901.16681,481193026814.0017,0.6572238674797882,,F32,I64,16777216,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,No,1195,0.00042337769372384964,0.004694569298674352,0.00041869027609605687,0.004721621480904926,0.00041666047469429343,1265,,,,,,40070708487.50959,480848501850.1151,0.6567533078153889,,F32,F64,16777216,67108864,134217728 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I8,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I16,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I32,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,F32,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I64,,, +copy_type_conversion_sweep,1,Quadro GP100,No,1910,0.0002665688994764398,0.004225222015302186,0.0002618850015064806,0.004212590939733298,0.0002600365045472557,2011,,,,,,32031647294.594746,512506356713.5159,0.6999922922769831,,I64,F64,8388608,67108864,67108864 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,I8,,, +copy_type_conversion_sweep,1,Quadro 
GP100,Yes,,,,,,,,,,,,,,,,,F64,I16,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,I32,,, +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,F32,,, +copy_type_conversion_sweep,1,Quadro GP100,No,1912,0.00026628649320083707,0.004156825161653807,0.0002615861928238536,0.00409518011787359,0.0002597900572277251,2016,,,,,,32068236895.242805,513091790323.8849,0.7007918901932432,,F64,I64,8388608,67108864,67108864 +copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,F64,,, diff --git a/examples/outputs/nvbench.example.axes.json b/examples/outputs/nvbench.example.axes.json new file mode 100644 index 0000000..b35a88e --- /dev/null +++ b/examples/outputs/nvbench.example.axes.json @@ -0,0 +1,18797 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": 
"simple", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": null, + "states": { + "Device=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010102518997995992" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005320863289715677" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010037636295826922" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005606955085401353" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0010072372945891786" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004201092756117083" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001002567436508759" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003010855735431417" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001474511532383" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 1, + "name": "single_float64_axis", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "Duration": { + "type": "float64", + "flags": "", + "values": [ + { + "input_string": "0", + "description": "", + "value": 0.0 + }, + { + "input_string": "0.0001", + "description": "", + "value": 0.0001 + }, + { + "input_string": "0.0002", + "description": "", + "value": 0.0002 + }, + { + "input_string": "0.0003", + "description": "", + "value": 0.00030000000000000003 + }, + { + "input_string": "0.0004", + "description": "", + "value": 0.0004 + }, + { + "input_string": "0.0005", + "description": "", + "value": 0.0005 + }, + { + "input_string": "0.0006", + "description": "", + "value": 0.0006000000000000001 + }, + { + "input_string": "0.0007", + "description": "", + "value": 0.0007000000000000001 + }, + { + "input_string": "0.0008", + "description": "", + "value": 0.0008000000000000001 + }, + { + "input_string": "0.0009", + "description": "", + "value": 0.0009000000000000002 + }, + { + "input_string": "0.001", + "description": "", + "value": 0.0010000000000000002 + } + ] + } + }, + "states": { + "Device=0 Duration=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "147957" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "1.0618111072811744e-05" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.03254637275181478" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "4.42401244240246e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.1078451537144948" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "2.0427748176574993e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "244766" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0001": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "4831" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00011004767191057754" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.004150807794866266" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00010351461256001611" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004781428015913556" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00010137620362095862" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "5088" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0002": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "2453" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00021036899388503913" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.002247040450593631" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0002039032309542721" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0024717338672984117" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00020172918129115027" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2582" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0003": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00030000000000000003" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1648" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00030986622087378617" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0014834014227566264" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003034121238635499" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0016613620256970604" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.00030116395027406757" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1736" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0004": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0004" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1239" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0004101481057304273" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0012074050648415652" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000403672849583562" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00138227314754308" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00040141034272550807" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1304" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0005": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "992" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0005105290151209681" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0009132925978805083" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0005040890874881937" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.001126663137928465" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0005017619947554283" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1042" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0006": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0006000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "829" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0006098617478890229" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0007603582722561039" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000603470634925204" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0009715240544303389" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0006011044563503441" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "872" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0007": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0007000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "711" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007102935302390999" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0006584784515618755" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007037439938168366" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0008111324463740949" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007014426981064088" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "748" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0008": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0008000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "622" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.000810564897106109" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006072346537084304" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0008041868904587526" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.000707547745748677" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008017951065694391" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "653" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0009": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0009000000000000002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "554" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0009098726931407932" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005164841732610924" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000903432962588876" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005681374617078215" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0009011252491744524" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "582" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.001": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0010000000000000002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010102697054108218" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00044759296847813034" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010038065348932821" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005354727313941588" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "152839" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "7.705229234684682e-06" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.05418085805591698" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "3.016308709558017e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.04127500754098809" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "1.3434882326935044e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "372166" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0001": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "4879" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00010715639024390229" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004066079001791793" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0001024813676868744" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00308685243303526" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0001013762513461915" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "5107" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0002": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "2466" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00020754400243309012" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0019319860083082698" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00020283288218606642" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0014786760562713035" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00020172824844867798" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "2586" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0003": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00030000000000000003" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1655" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00030687972205438067" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0013377397645028726" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.00030219129368978006" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0010508732574554497" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00030105650589762746" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1736" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0004": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0004" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1243" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0004072136267095737" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.001056923897828408" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004025078938310397" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0007582628414963917" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0004014090936302682" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1305" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0005": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "995" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0005075617336683415" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0007933683824162227" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0005028640134849749" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006232695683416088" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0005017614337245814" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1045" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0006": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0006000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "831" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0006069544103489773" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0007196728740232229" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0006022227483966297" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004991286373051354" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006010893901462271" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "873" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0007": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0007000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "712" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007072545856741567" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005896830799502328" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000702559235139509" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004401401059381315" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0007014417189327791" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "748" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0008": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0008000000000000001" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "623" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0008076356067415732" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0005473369257973612" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008029095356384018" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00040823678026317246" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008017940608599713" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "655" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0009": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0009000000000000002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "555" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0009069345225225232" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004925253150638496" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0009022477837296215" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003476692854287226" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000901123046875" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "582" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.001": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0010000000000000002" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072963186372732" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004478227973886651" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025936021116784" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003019453543048136" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001473230260019" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 2, + "name": "copy_sweep_grid_shape", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "BlockSize": { + "type": "int64", + "flags": "pow2", + "values": [ + { + "input_string": "6", + "description": "2^6 = 64", + "value": 64 + }, + { + "input_string": "8", + "description": "2^8 = 256", + "value": 256 + }, + { + "input_string": "10", + "description": "2^10 = 1024", + "value": 1024 + } + ] + }, + "NumBlocks": { + "type": "int64", + "flags": "pow2", + "values": [ + { + "input_string": "6", + "description": "2^6 = 64", + "value": 64 + }, + { + "input_string": "8", + "description": "2^8 = 256", + "value": 256 + }, + { + "input_string": "10", + "description": "2^10 = 1024", + "value": 1024 + } + ] + } + }, + "states": { + "Device=0 BlockSize=2^6 NumBlocks=2^6": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "66" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00762213684848485" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.001421350078274123" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.007615782766631154" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.001435096779193961" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "8811814367.137686" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "70494514937.10149" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.08099094087442726" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.007614612524060236" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "69" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^8 NumBlocks=2^6": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "206" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0024424633495145616" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004624145111239578" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0024361206686612466" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004621497870662145" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "27547430167.685093" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "220379441341.48074" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.25319329198239976" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0024339480377906972" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "215" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^10 NumBlocks=2^6": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "13161" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011122250784894763" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.012946747378394284" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011059346540513448" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.013016477670773642" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "60680677428.96169" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "485445419431.69354" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5577268146044273" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011028085603954402" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "13162" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^6 NumBlocks=2^8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "375" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0024504410826666676" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005006459599172983" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0024441843128204348" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004997852033101851" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "27456548038.53994" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "219652384308.31952" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.25235797829540385" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002444396898467488" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "376" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^8 NumBlocks=2^8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "13509" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010829132057147126" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00962532025494488" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010766412656412294" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009687962765161975" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "62331684788.27634" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "498653478306.2107" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5729015145981281" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010755151240936572" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "13510" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^10 NumBlocks=2^8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "15105" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0009647508075471688" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00509700309681285" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0009584777770750065" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0051422186996564374" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "70016087597.56184" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "560128700780.4948" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6435302168893552" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0009572492433860444" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "15106" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^6 NumBlocks=2^10": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "13582" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010768811630834938" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007374891553423134" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010706156819352108" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007408403333483488" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "62682496746.82156" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "501459973974.5725" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5761258892171099" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010709149855748453" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "13583" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^8 NumBlocks=2^10": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1782" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0009628989545454543" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004952339328546463" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009565676676170035" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004999266556092292" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "70155898293.3024" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "561247186346.4192" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6448152416663824" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009545994625289475" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1783" + } + } + }, + "is_skipped": false + }, + "Device=0 BlockSize=2^10 NumBlocks=2^10": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "14579" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010004705825502492" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.019254279361810064" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000994218452561916" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.019368874235788265" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "67499113325.72127" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "539992906605.77014" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6203962621849382" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009928190472480985" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "14580" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^6 NumBlocks=2^6": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "2236" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.006688950130143119" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.010949635482023172" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.006684225965911029" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.010951296261576433" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "10039885596.66435" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "80319084773.3148" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.10970154716634999" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.006674802853478132" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2237" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^8 NumBlocks=2^6": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "218" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.002301079724770642" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0028902989580355323" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0022963436360752907" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00289533405223059" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "29224225392.80601" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "233793803142.4481" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.31932064458922654" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0022982710453501917" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "228" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^10 NumBlocks=2^6": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "426" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0011791361924882624" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.003945300386512142" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011743737087003502" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.003937393066379565" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "57144385558.72278" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "457155084469.7822" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6243923247238066" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011721584260596465" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "449" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^6 NumBlocks=2^8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "226" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00221838060176991" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0015555610970668225" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002213621245021314" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00156938592170116" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "30316326314.149483" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "242530610513.19586" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.33125356549551443" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002213029949976925" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "237" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^8 NumBlocks=2^8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "12933" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011352358293512678" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006664341285751961" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0011305139959895455" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006686678312757843" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "59361373886.62687" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "474890991093.01495" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6486164104745069" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0011301243156632502" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "12934" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^10 NumBlocks=2^8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "256" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "447" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001123642310961969" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0021689290927995966" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001118954880392258" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0021631706036555243" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "59974593413.87786" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "479796747311.0229" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6553167986656235" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001117002699110243" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "468" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^6 NumBlocks=2^10": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "64" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "448" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0011216752544642855" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.003016436766525619" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011169237145887954" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0030169455335936853" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "60083659361.37964" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "480669274891.0371" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6565085157493404" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011148893315741358" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "470" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^8 NumBlocks=2^10": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "256" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "448" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011223883750000004" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.002772433370845318" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001117700926693421" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.002767310086041235" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "60041879180.089096" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "480335033440.71277" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6560520015306938" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001115604862286027" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "471" + } + } + }, + "is_skipped": false + }, + "Device=1 BlockSize=2^10 NumBlocks=2^10": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "BlockSize": { + "type": "int64", + "value": "1024" + }, + "NumBlocks": { + "type": "int64", + "value": "1024" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "474" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001060387455696202" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0015438063319203115" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010557063963845812" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0015281670861921415" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "63567734580.20524" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "508541876641.6419" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6945775194515432" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010540968806868097" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "498" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 3, + "name": "copy_type_sweep", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "T": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "U8", + "description": "uint8_t", + "is_active": true + }, + { + "input_string": "U16", + "description": "uint16_t", + "is_active": true + }, + { + "input_string": "U32", + "description": "uint32_t", + "is_active": true + }, + { + "input_string": "U64", + "description": "uint64_t", + "is_active": true + }, + { + "input_string": "F32", + "description": "float", + "is_active": true + }, + { + "input_string": "F64", + "description": "double", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 T=U8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U8" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "197" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0025498305939086305" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0029889374958217657" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002543548455698237" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0030039924801443835" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "105535813716.71214" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "211071627433.4243" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.24249957195935695" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002539370934940079" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "206" + } + } + }, + "is_skipped": false + }, + "Device=0 T=U16": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "314" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001601867525477707" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0041088290293520464" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001595620784789893" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004121783467130656" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "84116307132.25725" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "336465228529.029" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.3865639114533881" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0015915001814459024" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "331" + } + } + }, + "is_skipped": false + }, + "Device=0 T=U32": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "13509" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010828757243319257" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.009631338263504865" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010766108759909336" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00968747141038187" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "62333444233.72251" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "498667553869.7801" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5729176859717142" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010754745522399284" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "13510" + } + } + }, + "is_skipped": false + }, + "Device=0 T=U64": { + "device": 0, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "15542" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.000936680335671086" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006014811602456382" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009304157742707227" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006054554605786883" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "36063911347.91389" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "577022581566.6222" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6629395468366523" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0009291894323484043" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "15543" + } + } + }, + "is_skipped": false + }, + "Device=0 T=F32": { + "device": 0, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "13508" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010830444422564424" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009669058622361396" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010767860333789413" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009724931080690958" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "62323304648.94053" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "498586437191.52423" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5728244912586445" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010753850626975213" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "13509" + } + } + }, + "is_skipped": false + }, + "Device=0 T=F64": { + "device": 0, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "15546" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0009366829067284162" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005915735845058088" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.000930411673893452" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005957150780596753" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "36064070283.626465" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "577025124538.0234" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6629424684490159" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.000929181897058261" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "15547" + } + } + }, + "is_skipped": false + }, + "Device=1 T=U8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U8" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "5497" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0027078292635983312" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.006395900728715414" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.002703092247003721" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006391943780941085" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "99306805491.95126" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "198613610983.90253" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.2712707754915627" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0026997315984675927" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "5498" + } + } + }, + "is_skipped": false + }, + "Device=1 T=U16": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "330" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.001520048475757576" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004360351977281759" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0015153354655612601" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004371610333527513" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "88572947080.2609" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "354291788321.0436" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.48389940494023653" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0015136887122844827" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "348" + } + } + }, + "is_skipped": false + }, + "Device=1 T=U32": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "12935" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001135307039969079" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006620691359921369" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011306220265028327" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006643085804704841" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "59355701929.47401" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "474845615435.79205" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6485544354182038" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011301225775930074" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "12936" + } + } + }, + "is_skipped": false + }, + "Device=1 T=U64": { + "device": 1, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "U64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "478" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010522014937238498" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.002705184693469705" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010475129038718955" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0026904354098530054" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "32032476044.899876" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "512519616718.398" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.7000104030791057" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010449056396484376" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "500" + } + } + }, + "is_skipped": false + }, + "Device=1 T=F32": { + "device": 1, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "12933" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011353411816283944" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0066894954953519185" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011306308270201252" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006710880496820158" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "59355239921.125435" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "474841919369.0035" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6485493872500594" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001130335244040081" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "12934" + } + } + }, + "is_skipped": false + }, + "Device=1 T=F64": { + "device": 1, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "T": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "477" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0010531248867924527" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0028403001389236257" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010484168726943076" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0028275833565767315" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "32004856917.047768" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "512077710672.7643" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6994068382221977" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001045540454641914" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "497" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 4, + "name": "copy_type_conversion_sweep", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "In": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I8", + "description": "int8_t", + "is_active": true + }, + { + "input_string": "I16", + "description": "int16_t", + "is_active": true + }, + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "F32", + "description": "float", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + }, + { + "input_string": "F64", + "description": "double", + "is_active": true + } + ] + }, + "Out": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I8", + "description": "int8_t", + "is_active": true + }, + { + "input_string": "I16", + "description": "int16_t", + "is_active": true + }, + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "F32", + "description": "float", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + }, + { + "input_string": "F64", + "description": "double", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 In=I8 Out=I8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=0 In=I8 Out=I16": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "712" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007091774676966297" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.002907153358900259" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007029326261764157" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0029793074635008562" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "95469838076.85379" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "286409514230.56134" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.3290550485185677" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006997137222698028" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "748" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=I32": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "622" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0008109339212218649" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0030780801739523534" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008046977720268298" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.003128636670235119" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "83396358648.99908" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "416981793244.9954" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.4790691558421363" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008020894928445145" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "654" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=F32": { + "device": 0, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "614" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0008210284218241037" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0032226581332443155" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008147683657147577" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0032802944549392547" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "82365573853.77692" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "411827869268.8846" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.47314782774458247" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008120876105256783" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "645" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=I64": { + "device": 0, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "536870912" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "12047" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001218285543537808" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.010379223139149431" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0012120340953301184" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.010435925751575564" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "55368792229.992294" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "498319130069.93066" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5725173828928432" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0012109437788209631" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "12048" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=F64": { + "device": 0, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "536870912" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "12345" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0011880294179019875" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007390662960171094" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0011817586330519084" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007441978357839151" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "56787284749.24732" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "511085562743.2259" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5871846998428606" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00118048292704533" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "12346" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=I8": { + "device": 0, + "type_config_index": 6, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I16 Out=I16": { + "device": 0, + "type_config_index": 7, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=0 In=I16 Out=I32": { + "device": 0, + "type_config_index": 8, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "30679" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00045337666328758926" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.005585454361874255" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004471040883478709" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005691409845285877" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "75048367649.66653" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "450290205897.9992" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5173370931732527" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00044639401537845537" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "30680" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=F32": { + "device": 0, + "type_config_index": 9, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1111" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00045644497029703035" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004630719301683377" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000450117155493756" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004699806594964763" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "74545996726.5466" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "447275980359.2796" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.513874058317187" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004474967917148505" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1162" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=I64": { + "device": 0, + "type_config_index": 10, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "21586" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0006636389210136234" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00678406319186359" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006573809432517179" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006858260008522721" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "51042599187.65498" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "510425991876.5498" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5864269208140508" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006561174402032673" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "21587" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=F64": { + "device": 0, + "type_config_index": 11, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "21638" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0006617334348830768" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.006935544837856402" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006554566019343122" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0070075276540148375" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "51192454086.1711" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "511924540861.71094" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5881485993356054" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000653902132173752" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "21639" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I32 Out=I8": { + "device": 0, + "type_config_index": 12, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=I32 Out=I16": { + "device": 0, + "type_config_index": 13, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I32 Out=I32": { + "device": 0, + "type_config_index": 14, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." + }, + "Device=0 In=I32 Out=F32": { + "device": 0, + "type_config_index": 15, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time 
measurements." + }, + "value": { + "type": "int64", + "value": "47462" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00027253003156209153" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.012292429580379986" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002662904619916644" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.01259432062607704" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "63003443212.04103" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "504027545696.32825" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5790757648165535" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00026489117725947637" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "47463" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I32 Out=I64": { + "device": 0, + "type_config_index": 16, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "35529" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0003843450104703185" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007061698560844491" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003780983807571241" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007193170430510475" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "44372620603.146774" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "532471447237.7613" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6117548796389721" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003773213782537234" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "35530" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I32 Out=F64": { + "device": 0, + "type_config_index": 17, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "35498" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0003847829341653037" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007946267454909624" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003785711211350411" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008088117160474145" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "44317210329.45711" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "531806523953.4853" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6109909512333241" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00037782656535605304" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "35499" + } + } + }, + "is_skipped": false + }, + "Device=0 In=F32 Out=I8": { + "device": 0, + "type_config_index": 18, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=F32 Out=I16": { + "device": 0, + "type_config_index": 19, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=F32 Out=I32": { + "device": 0, + "type_config_index": 20, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "47607" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.00027168871905392146" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.01320649583700062" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002654402574849676" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.013542194342127112" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "63205243089.21048" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "505641944713.68384" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.580930543099361" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00026387154338796694" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "47608" + } + } + }, + "is_skipped": false + }, + "Device=0 In=F32 Out=F32": { + "device": 0, + "type_config_index": 21, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=0 In=F32 Out=I64": { + "device": 0, + "type_config_index": 22, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "35499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00038478567168089093" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007793965803915478" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00037854311241399495" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007936841129938173" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "44320489396.863045" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "531845872762.35657" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6110361589641045" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003779212159950015" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "35500" + } + } + }, + "is_skipped": false + }, + "Device=0 In=F32 Out=F64": { + "device": 0, + "type_config_index": 23, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "35509" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00038455552378270424" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008192214819970133" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003782884006425947" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008364103785135068" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "44350331576.38646" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "532203978916.6376" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6114475860715045" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0003775098086048749" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "35510" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I64 Out=I8": { + "device": 0, + "type_config_index": 24, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=I64 Out=I16": { + "device": 0, + "type_config_index": 25, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I64 Out=I32": { + "device": 0, + "type_config_index": 26, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I64 Out=F32": { + "device": 0, + "type_config_index": 27, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I64 Out=I64": { + "device": 0, + "type_config_index": 28, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=0 In=I64 Out=F64": { + "device": 0, + "type_config_index": 29, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "8388608" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "52100" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00024245567445297397" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.008539487651977316" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002361851441692315" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008809731550468568" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "35517085672.37146" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "568273370757.9434" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6528876042715341" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00023509478066781645" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "52101" + } + } + }, + "is_skipped": false + }, + "Device=0 In=F64 Out=I8": { + "device": 0, + "type_config_index": 30, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=F64 Out=I16": { + "device": 0, + "type_config_index": 31, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=F64 Out=I32": { + "device": 0, + "type_config_index": 32, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=F64 Out=F32": { + "device": 0, + "type_config_index": 33, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=F64 Out=I64": { + "device": 0, + "type_config_index": 34, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "8388608" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions 
in cold time measurements." + }, + "value": { + "type": "int64", + "value": "51780" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0002443157005407483" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009752081536074006" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00023802995230194252" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.010060889608084888" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "35241816917.89358" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "563869070686.2972" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6478275168730437" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00023685153101656335" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "51781" + } + } + }, + "is_skipped": false + }, + "Device=0 In=F64 Out=F64": { + "device": 0, + "type_config_index": 35, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=1 In=I8 Out=I8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." + }, + "Device=1 In=I8 Out=I16": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "21576" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0006664836124397485" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008032774520430731" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0006617529590178744" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008026552131843242" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "101410750168.15656" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "304232250504.46967" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.415527002983596" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000660956536269952" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "21577" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I32": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "16864" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0008629180932163245" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007882137751662148" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008582004840137586" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00791837780480227" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "78197187312.3811" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "390985936561.9055" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.534017068075155" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008575679956081893" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "16865" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=F32": { + "device": 1, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "16866" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0008626138966559889" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007971447505119814" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008578826743068147" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.008007931525887958" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "78226156104.88371" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "391130780524.4186" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5342148990991294" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0008570867944797" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "16867" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I64": { + "device": 1, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "536870912" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "10113" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0014600333443093004" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.005526790210274132" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0014553281934728316" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0055395074611650314" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "46112529325.676674" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "415012763931.09" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.5668334297572799" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0014537668022591766" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "10114" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=F64": { + "device": 1, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "536870912" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "10100" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0014618894557425724" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005445897883226132" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001457197949886321" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005462563281467926" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "46053361525.27205" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "414480253727.4484" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5661061157772187" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0014559329674746807" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "10101" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=I8": { + "device": 1, + "type_config_index": 6, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I16 Out=I16": { + "device": 1, + "type_config_index": 7, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." + }, + "Device=1 In=I16 Out=I32": { + "device": 1, + "type_config_index": 8, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "30414" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.00046039200555665144" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00783107553466636" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00045566900502105564" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00787891812842585" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "73637731841.00926" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "441826391046.0556" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6034560629453338" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004554145292263416" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "30415" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=F32": { + "device": 1, + "type_config_index": 9, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "30506" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00045893235589720245" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.007769133803136496" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00045424179060767235" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007825865470567534" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "73869099439.55573" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "443214596637.33435" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6053521042358697" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00045380600078266583" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "30507" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=I64": { + "device": 1, + "type_config_index": 10, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "19198" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007536445464110835" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0055724274957070705" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007489312136715944" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005596667585138299" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "44803089238.998634" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "448030892389.9864" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6119303053840505" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007480235320061677" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "19199" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=F64": { + "device": 1, + "type_config_index": 11, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "33554432" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "268435456" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "19239" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007522073903529291" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.005390014038421145" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007475173373753664" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00541460189354359" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "44887831120.83274" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "448878311208.3274" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6130877283767584" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007464590773358217" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "19240" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I32 Out=I8": { + "device": 1, + "type_config_index": 12, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I32 Out=I16": { + "device": 1, + "type_config_index": 13, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I32 Out=I32": { + "device": 1, + "type_config_index": 14, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." + }, + "Device=1 In=I32 Out=F32": { + "device": 1, + "type_config_index": 15, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "47007" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.00027858482081392173" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007845561244706417" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00027388008982136936" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.007927787169621173" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "61257523359.73918" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "490060186877.91345" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.669334826920227" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002735391274529489" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "47008" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I32 Out=I64": { + "device": 1, + "type_config_index": 16, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1196" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00042285716053511705" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0044259706705914136" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000418105284879638" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004434430384125105" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "40126773343.29734" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "481521280119.5681" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6576722029605115" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00041619211102596686" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1254" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I32 Out=F64": { + "device": 1, + "type_config_index": 17, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1195" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0004233829054393305" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004675323526829325" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0004187026210409825" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004672495644368017" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "40069527050.69847" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "480834324608.3817" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6567339442312906" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000416603259004343" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1252" + } + } + }, + "is_skipped": false + }, + "Device=1 In=F32 Out=I8": { + "device": 1, + "type_config_index": 18, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=F32 Out=I16": { + "device": 1, + "type_config_index": 19, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=F32 Out=I32": { + "device": 1, + "type_config_index": 20, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "46545" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.000281580676807392" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.012343793517939674" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00027684729846364837" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.012502080899567246" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "60600974230.5755" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "484807793844.604" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6621609946522673" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00027647899042774625" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "46546" + } + } + }, + "is_skipped": false + }, + "Device=1 In=F32 Out=F32": { + "device": 1, + "type_config_index": 21, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=1 In=F32 Out=I64": { + "device": 1, + "type_config_index": 22, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1196" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00042307038377926434" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.004605601751545893" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00041839050190105916" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004593649085141324" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "40099418901.16681" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "481193026814.0017" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6572238674797882" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00041637280448239853" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1257" + } + } + }, + "is_skipped": false + }, + "Device=1 In=F32 Out=F64": { + "device": 1, + "type_config_index": 23, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F32" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "134217728" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "1195" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00042337769372384964" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004694569298674352" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00041869027609605687" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004721621480904926" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "40070708487.50959" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "480848501850.1151" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6567533078153889" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00041666047469429343" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1265" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I64 Out=I8": { + "device": 1, + "type_config_index": 24, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I64 Out=I16": { + "device": 1, + "type_config_index": 25, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I64 Out=I32": { + "device": 1, + "type_config_index": 26, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I64 Out=F32": { + "device": 1, + "type_config_index": 27, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I64 Out=I64": { + "device": 1, + "type_config_index": 28, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ }, + "Device=1 In=I64 Out=F64": { + "device": 1, + "type_config_index": 29, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "8388608" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1910" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0002665688994764398" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.004225222015302186" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002618850015064806" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004212590939733298" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "32031647294.594746" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "512506356713.5159" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.6999922922769831" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002600365045472557" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2011" + } + } + }, + "is_skipped": false + }, + "Device=1 In=F64 Out=I8": { + "device": 1, + "type_config_index": 30, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=F64 Out=I16": { + "device": 1, + "type_config_index": 31, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=F64 Out=I32": { + "device": 1, + "type_config_index": 32, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=F64 Out=F32": { + "device": 1, + "type_config_index": 33, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "F32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=F64 Out=I64": { + "device": 1, + "type_config_index": 34, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "8388608" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "InSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "OutSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions 
in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1912" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00026628649320083707" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004156825161653807" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002615861928238536" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00409518011787359" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." 
+ }, + "value": { + "type": "float64", + "value": "32068236895.242805" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "513091790323.8849" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.7007918901932432" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002597900572277251" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2016" + } + } + }, + "is_skipped": false + }, + "Device=1 In=F64 Out=F64": { + "device": 1, + "type_config_index": 35, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "F64" + }, + "Out": { + "type": "string", + "value": "F64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Not a conversion: InputType == OutputType." 
+ } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.axes.list.md b/examples/outputs/nvbench.example.axes.list.md new file mode 100644 index 0000000..7cb1d2c --- /dev/null +++ b/examples/outputs/nvbench.example.axes.list.md @@ -0,0 +1,93 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 30117 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14891 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `simple` (1 configurations) + +## [1] `single_float64_axis` (11 configurations) + +### Axes + +* `Duration` : float64 + * `0` + * `0.0001` + * `0.0002` + * `0.0003` + * `0.0004` + * `0.0005` + * `0.0006` + * `0.0007` + * `0.0008` + * `0.0009` + * `0.001` + +## [2] `copy_sweep_grid_shape` (9 configurations) + +### Axes + +* `BlockSize` : int64 [pow2] + * `6` (2^6 = 64) + * `8` (2^8 = 256) + * `10` (2^10 = 1024) +* `NumBlocks` : int64 [pow2] + * `6` (2^6 = 64) + * `8` (2^8 = 256) + * `10` (2^10 = 1024) + +## [3] `copy_type_sweep` (6 configurations) + +### Axes + +* `T` : type + * `U8` (uint8_t) + * `U16` (uint16_t) + * `U32` (uint32_t) + * `U64` (uint64_t) + * `F32` (float) + * `F64` (double) + +## [4] `copy_type_conversion_sweep` (36 configurations) + +### Axes + +* `In` : type + * `I8` (int8_t) + * 
`I16` (int16_t) + * `I32` (int32_t) + * `F32` (float) + * `I64` (int64_t) + * `F64` (double) +* `Out` : type + * `I8` (int8_t) + * `I16` (int16_t) + * `I32` (int32_t) + * `F32` (float) + * `I64` (int64_t) + * `F64` (double) + diff --git a/examples/outputs/nvbench.example.axes.md b/examples/outputs/nvbench.example.axes.md new file mode 100644 index 0000000..3c0ec03 --- /dev/null +++ b/examples/outputs/nvbench.example.axes.md @@ -0,0 +1,563 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 29776 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14335 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: simple [Device=0] +Pass: Cold: 1.003764ms GPU, 1.010252ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: simple [Device=1] +Pass: Cold: 1.002567ms GPU, 1.007237ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: single_float64_axis [Device=0 Duration=0] +Warn: Current measurement timed out (15.00s) while over noise threshold (10.78% > 0.50%) +Pass: Cold: 0.004424ms GPU, 0.010618ms CPU, 0.65s total GPU, 147957x +Pass: Batch: 0.002043ms GPU, 0.50s total GPU, 244766x +Run: single_float64_axis [Device=0 Duration=0.0001] +Pass: Cold: 0.103515ms GPU, 
0.110048ms CPU, 0.50s total GPU, 4831x +Pass: Batch: 0.101376ms GPU, 0.52s total GPU, 5088x +Run: single_float64_axis [Device=0 Duration=0.0002] +Pass: Cold: 0.203903ms GPU, 0.210369ms CPU, 0.50s total GPU, 2453x +Pass: Batch: 0.201729ms GPU, 0.52s total GPU, 2582x +Run: single_float64_axis [Device=0 Duration=0.0003] +Pass: Cold: 0.303412ms GPU, 0.309866ms CPU, 0.50s total GPU, 1648x +Pass: Batch: 0.301164ms GPU, 0.52s total GPU, 1736x +Run: single_float64_axis [Device=0 Duration=0.0004] +Pass: Cold: 0.403673ms GPU, 0.410148ms CPU, 0.50s total GPU, 1239x +Pass: Batch: 0.401410ms GPU, 0.52s total GPU, 1304x +Run: single_float64_axis [Device=0 Duration=0.0005] +Pass: Cold: 0.504089ms GPU, 0.510529ms CPU, 0.50s total GPU, 992x +Pass: Batch: 0.501762ms GPU, 0.52s total GPU, 1042x +Run: single_float64_axis [Device=0 Duration=0.0006] +Pass: Cold: 0.603471ms GPU, 0.609862ms CPU, 0.50s total GPU, 829x +Pass: Batch: 0.601104ms GPU, 0.52s total GPU, 872x +Run: single_float64_axis [Device=0 Duration=0.0007] +Pass: Cold: 0.703744ms GPU, 0.710294ms CPU, 0.50s total GPU, 711x +Pass: Batch: 0.701443ms GPU, 0.52s total GPU, 748x +Run: single_float64_axis [Device=0 Duration=0.0008] +Pass: Cold: 0.804187ms GPU, 0.810565ms CPU, 0.50s total GPU, 622x +Pass: Batch: 0.801795ms GPU, 0.52s total GPU, 653x +Run: single_float64_axis [Device=0 Duration=0.0009] +Pass: Cold: 0.903433ms GPU, 0.909873ms CPU, 0.50s total GPU, 554x +Pass: Batch: 0.901125ms GPU, 0.52s total GPU, 582x +Run: single_float64_axis [Device=0 Duration=0.001] +Pass: Cold: 1.003807ms GPU, 1.010270ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: single_float64_axis [Device=1 Duration=0] +Warn: Current measurement timed out (15.00s) while over noise threshold (4.13% > 0.50%) +Warn: Current measurement timed out (15.00s) before accumulating min_time (0.46s < 0.50s) +Pass: Cold: 0.003016ms GPU, 0.007705ms CPU, 0.46s total GPU, 152839x +Pass: Batch: 0.001343ms GPU, 0.50s total GPU, 372166x 
+Run: single_float64_axis [Device=1 Duration=0.0001] +Pass: Cold: 0.102481ms GPU, 0.107156ms CPU, 0.50s total GPU, 4879x +Pass: Batch: 0.101376ms GPU, 0.52s total GPU, 5107x +Run: single_float64_axis [Device=1 Duration=0.0002] +Pass: Cold: 0.202833ms GPU, 0.207544ms CPU, 0.50s total GPU, 2466x +Pass: Batch: 0.201728ms GPU, 0.52s total GPU, 2586x +Run: single_float64_axis [Device=1 Duration=0.0003] +Pass: Cold: 0.302191ms GPU, 0.306880ms CPU, 0.50s total GPU, 1655x +Pass: Batch: 0.301057ms GPU, 0.52s total GPU, 1736x +Run: single_float64_axis [Device=1 Duration=0.0004] +Pass: Cold: 0.402508ms GPU, 0.407214ms CPU, 0.50s total GPU, 1243x +Pass: Batch: 0.401409ms GPU, 0.52s total GPU, 1305x +Run: single_float64_axis [Device=1 Duration=0.0005] +Pass: Cold: 0.502864ms GPU, 0.507562ms CPU, 0.50s total GPU, 995x +Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1045x +Run: single_float64_axis [Device=1 Duration=0.0006] +Pass: Cold: 0.602223ms GPU, 0.606954ms CPU, 0.50s total GPU, 831x +Pass: Batch: 0.601089ms GPU, 0.52s total GPU, 873x +Run: single_float64_axis [Device=1 Duration=0.0007] +Pass: Cold: 0.702559ms GPU, 0.707255ms CPU, 0.50s total GPU, 712x +Pass: Batch: 0.701442ms GPU, 0.52s total GPU, 748x +Run: single_float64_axis [Device=1 Duration=0.0008] +Pass: Cold: 0.802910ms GPU, 0.807636ms CPU, 0.50s total GPU, 623x +Pass: Batch: 0.801794ms GPU, 0.53s total GPU, 655x +Run: single_float64_axis [Device=1 Duration=0.0009] +Pass: Cold: 0.902248ms GPU, 0.906935ms CPU, 0.50s total GPU, 555x +Pass: Batch: 0.901123ms GPU, 0.52s total GPU, 582x +Run: single_float64_axis [Device=1 Duration=0.001] +Pass: Cold: 1.002594ms GPU, 1.007296ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^6] +Pass: Cold: 7.615783ms GPU, 7.622137ms CPU, 0.50s total GPU, 66x +Pass: Batch: 7.614613ms GPU, 0.53s total GPU, 69x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^6] +Pass: Cold: 
2.436121ms GPU, 2.442463ms CPU, 0.50s total GPU, 206x +Pass: Batch: 2.433948ms GPU, 0.52s total GPU, 215x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^6] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.30% > 0.50%) +Pass: Cold: 1.105935ms GPU, 1.112225ms CPU, 14.56s total GPU, 13161x +Pass: Batch: 1.102809ms GPU, 14.52s total GPU, 13162x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^8] +Pass: Cold: 2.444184ms GPU, 2.450441ms CPU, 0.92s total GPU, 375x +Pass: Batch: 2.444397ms GPU, 0.92s total GPU, 376x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^8] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%) +Pass: Cold: 1.076641ms GPU, 1.082913ms CPU, 14.54s total GPU, 13509x +Pass: Batch: 1.075515ms GPU, 14.53s total GPU, 13510x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^8] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.51% > 0.50%) +Pass: Cold: 0.958478ms GPU, 0.964751ms CPU, 14.48s total GPU, 15105x +Pass: Batch: 0.957249ms GPU, 14.46s total GPU, 15106x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^10] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.74% > 0.50%) +Pass: Cold: 1.070616ms GPU, 1.076881ms CPU, 14.54s total GPU, 13582x +Pass: Batch: 1.070915ms GPU, 14.55s total GPU, 13583x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^10] +Pass: Cold: 0.956568ms GPU, 0.962899ms CPU, 1.70s total GPU, 1782x +Pass: Batch: 0.954599ms GPU, 1.70s total GPU, 1783x +Run: copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^10] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.94% > 0.50%) +Pass: Cold: 0.994218ms GPU, 1.000471ms CPU, 14.49s total GPU, 14579x +Pass: Batch: 0.992819ms GPU, 14.48s total GPU, 14580x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^6] +Warn: Current measurement timed out 
(15.00s) while over noise threshold (1.10% > 0.50%) +Pass: Cold: 6.684226ms GPU, 6.688950ms CPU, 14.95s total GPU, 2236x +Pass: Batch: 6.674803ms GPU, 14.93s total GPU, 2237x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^6] +Pass: Cold: 2.296344ms GPU, 2.301080ms CPU, 0.50s total GPU, 218x +Pass: Batch: 2.298271ms GPU, 0.52s total GPU, 228x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^6] +Pass: Cold: 1.174374ms GPU, 1.179136ms CPU, 0.50s total GPU, 426x +Pass: Batch: 1.172158ms GPU, 0.53s total GPU, 449x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^8] +Pass: Cold: 2.213621ms GPU, 2.218381ms CPU, 0.50s total GPU, 226x +Pass: Batch: 2.213030ms GPU, 0.52s total GPU, 237x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^8] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.67% > 0.50%) +Pass: Cold: 1.130514ms GPU, 1.135236ms CPU, 14.62s total GPU, 12933x +Pass: Batch: 1.130124ms GPU, 14.62s total GPU, 12934x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^8] +Pass: Cold: 1.118955ms GPU, 1.123642ms CPU, 0.50s total GPU, 447x +Pass: Batch: 1.117003ms GPU, 0.52s total GPU, 468x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^10] +Pass: Cold: 1.116924ms GPU, 1.121675ms CPU, 0.50s total GPU, 448x +Pass: Batch: 1.114889ms GPU, 0.52s total GPU, 470x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^10] +Pass: Cold: 1.117701ms GPU, 1.122388ms CPU, 0.50s total GPU, 448x +Pass: Batch: 1.115605ms GPU, 0.53s total GPU, 471x +Run: copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^10] +Pass: Cold: 1.055706ms GPU, 1.060387ms CPU, 0.50s total GPU, 474x +Pass: Batch: 1.054097ms GPU, 0.52s total GPU, 498x +Run: copy_type_sweep [Device=0 T=U8] +Pass: Cold: 2.543548ms GPU, 2.549831ms CPU, 0.50s total GPU, 197x +Pass: Batch: 2.539371ms GPU, 0.52s total GPU, 206x +Run: copy_type_sweep [Device=0 T=U16] +Pass: Cold: 1.595621ms GPU, 1.601868ms 
CPU, 0.50s total GPU, 314x +Pass: Batch: 1.591500ms GPU, 0.53s total GPU, 331x +Run: copy_type_sweep [Device=0 T=U32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%) +Pass: Cold: 1.076611ms GPU, 1.082876ms CPU, 14.54s total GPU, 13509x +Pass: Batch: 1.075475ms GPU, 14.53s total GPU, 13510x +Run: copy_type_sweep [Device=0 T=U64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.61% > 0.50%) +Pass: Cold: 0.930416ms GPU, 0.936680ms CPU, 14.46s total GPU, 15542x +Pass: Batch: 0.929189ms GPU, 14.44s total GPU, 15543x +Run: copy_type_sweep [Device=0 T=F32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%) +Pass: Cold: 1.076786ms GPU, 1.083044ms CPU, 14.55s total GPU, 13508x +Pass: Batch: 1.075385ms GPU, 14.53s total GPU, 13509x +Run: copy_type_sweep [Device=0 T=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.60% > 0.50%) +Pass: Cold: 0.930412ms GPU, 0.936683ms CPU, 14.46s total GPU, 15546x +Pass: Batch: 0.929182ms GPU, 14.45s total GPU, 15547x +Run: copy_type_sweep [Device=1 T=U8] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.64% > 0.50%) +Pass: Cold: 2.703092ms GPU, 2.707829ms CPU, 14.86s total GPU, 5497x +Pass: Batch: 2.699732ms GPU, 14.84s total GPU, 5498x +Run: copy_type_sweep [Device=1 T=U16] +Pass: Cold: 1.515335ms GPU, 1.520048ms CPU, 0.50s total GPU, 330x +Pass: Batch: 1.513689ms GPU, 0.53s total GPU, 348x +Run: copy_type_sweep [Device=1 T=U32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.66% > 0.50%) +Pass: Cold: 1.130622ms GPU, 1.135307ms CPU, 14.62s total GPU, 12935x +Pass: Batch: 1.130123ms GPU, 14.62s total GPU, 12936x +Run: copy_type_sweep [Device=1 T=U64] +Pass: Cold: 1.047513ms GPU, 1.052201ms CPU, 0.50s total GPU, 478x +Pass: Batch: 1.044906ms GPU, 0.52s total GPU, 500x +Run: copy_type_sweep [Device=1 T=F32] +Warn: Current measurement timed out (15.00s) 
while over noise threshold (0.67% > 0.50%) +Pass: Cold: 1.130631ms GPU, 1.135341ms CPU, 14.62s total GPU, 12933x +Pass: Batch: 1.130335ms GPU, 14.62s total GPU, 12934x +Run: copy_type_sweep [Device=1 T=F64] +Pass: Cold: 1.048417ms GPU, 1.053125ms CPU, 0.50s total GPU, 477x +Pass: Batch: 1.045540ms GPU, 0.52s total GPU, 497x +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=I8] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=I16] +Pass: Cold: 0.702933ms GPU, 0.709177ms CPU, 0.50s total GPU, 712x +Pass: Batch: 0.699714ms GPU, 0.52s total GPU, 748x +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=I32] +Pass: Cold: 0.804698ms GPU, 0.810934ms CPU, 0.50s total GPU, 622x +Pass: Batch: 0.802089ms GPU, 0.52s total GPU, 654x +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=F32] +Pass: Cold: 0.814768ms GPU, 0.821028ms CPU, 0.50s total GPU, 614x +Pass: Batch: 0.812088ms GPU, 0.52s total GPU, 645x +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.04% > 0.50%) +Pass: Cold: 1.212034ms GPU, 1.218286ms CPU, 14.60s total GPU, 12047x +Pass: Batch: 1.210944ms GPU, 14.59s total GPU, 12048x +Run: copy_type_conversion_sweep [Device=0 In=I8 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.74% > 0.50%) +Pass: Cold: 1.181759ms GPU, 1.188029ms CPU, 14.59s total GPU, 12345x +Pass: Batch: 1.180483ms GPU, 14.57s total GPU, 12346x +Run: copy_type_conversion_sweep [Device=0 In=I16 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I16 Out=I16] +Skip: Not a conversion: InputType == OutputType. 
+Run: copy_type_conversion_sweep [Device=0 In=I16 Out=I32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.57% > 0.50%) +Pass: Cold: 0.447104ms GPU, 0.453377ms CPU, 13.72s total GPU, 30679x +Pass: Batch: 0.446394ms GPU, 13.70s total GPU, 30680x +Run: copy_type_conversion_sweep [Device=0 In=I16 Out=F32] +Pass: Cold: 0.450117ms GPU, 0.456445ms CPU, 0.50s total GPU, 1111x +Pass: Batch: 0.447497ms GPU, 0.52s total GPU, 1162x +Run: copy_type_conversion_sweep [Device=0 In=I16 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.69% > 0.50%) +Pass: Cold: 0.657381ms GPU, 0.663639ms CPU, 14.19s total GPU, 21586x +Pass: Batch: 0.656117ms GPU, 14.16s total GPU, 21587x +Run: copy_type_conversion_sweep [Device=0 In=I16 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.70% > 0.50%) +Pass: Cold: 0.655457ms GPU, 0.661733ms CPU, 14.18s total GPU, 21638x +Pass: Batch: 0.653902ms GPU, 14.15s total GPU, 21639x +Run: copy_type_conversion_sweep [Device=0 In=I32 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I32 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I32 Out=I32] +Skip: Not a conversion: InputType == OutputType. 
+Run: copy_type_conversion_sweep [Device=0 In=I32 Out=F32] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.26% > 0.50%) +Pass: Cold: 0.266290ms GPU, 0.272530ms CPU, 12.64s total GPU, 47462x +Pass: Batch: 0.264891ms GPU, 12.57s total GPU, 47463x +Run: copy_type_conversion_sweep [Device=0 In=I32 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.72% > 0.50%) +Pass: Cold: 0.378098ms GPU, 0.384345ms CPU, 13.43s total GPU, 35529x +Pass: Batch: 0.377321ms GPU, 13.41s total GPU, 35530x +Run: copy_type_conversion_sweep [Device=0 In=I32 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.81% > 0.50%) +Pass: Cold: 0.378571ms GPU, 0.384783ms CPU, 13.44s total GPU, 35498x +Pass: Batch: 0.377827ms GPU, 13.41s total GPU, 35499x +Run: copy_type_conversion_sweep [Device=0 In=F32 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=F32 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=F32 Out=I32] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.35% > 0.50%) +Pass: Cold: 0.265440ms GPU, 0.271689ms CPU, 12.64s total GPU, 47607x +Pass: Batch: 0.263872ms GPU, 12.56s total GPU, 47608x +Run: copy_type_conversion_sweep [Device=0 In=F32 Out=F32] +Skip: Not a conversion: InputType == OutputType. 
+Run: copy_type_conversion_sweep [Device=0 In=F32 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%) +Pass: Cold: 0.378543ms GPU, 0.384786ms CPU, 13.44s total GPU, 35499x +Pass: Batch: 0.377921ms GPU, 13.42s total GPU, 35500x +Run: copy_type_conversion_sweep [Device=0 In=F32 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.84% > 0.50%) +Pass: Cold: 0.378288ms GPU, 0.384556ms CPU, 13.43s total GPU, 35509x +Pass: Batch: 0.377510ms GPU, 13.41s total GPU, 35510x +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=I32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=F32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=I64] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=0 In=I64 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.88% > 0.50%) +Pass: Cold: 0.236185ms GPU, 0.242456ms CPU, 12.31s total GPU, 52100x +Pass: Batch: 0.235095ms GPU, 12.25s total GPU, 52101x +Run: copy_type_conversion_sweep [Device=0 In=F64 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=F64 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=F64 Out=I32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=0 In=F64 Out=F32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). 
+Run: copy_type_conversion_sweep [Device=0 In=F64 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.01% > 0.50%) +Pass: Cold: 0.238030ms GPU, 0.244316ms CPU, 12.33s total GPU, 51780x +Pass: Batch: 0.236852ms GPU, 12.26s total GPU, 51781x +Run: copy_type_conversion_sweep [Device=0 In=F64 Out=F64] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=I8] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=I16] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.80% > 0.50%) +Pass: Cold: 0.661753ms GPU, 0.666484ms CPU, 14.28s total GPU, 21576x +Pass: Batch: 0.660957ms GPU, 14.26s total GPU, 21577x +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=I32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%) +Pass: Cold: 0.858200ms GPU, 0.862918ms CPU, 14.47s total GPU, 16864x +Pass: Batch: 0.857568ms GPU, 14.46s total GPU, 16865x +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=F32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.80% > 0.50%) +Pass: Cold: 0.857883ms GPU, 0.862614ms CPU, 14.47s total GPU, 16866x +Pass: Batch: 0.857087ms GPU, 14.46s total GPU, 16867x +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.55% > 0.50%) +Pass: Cold: 1.455328ms GPU, 1.460033ms CPU, 14.72s total GPU, 10113x +Pass: Batch: 1.453767ms GPU, 14.70s total GPU, 10114x +Run: copy_type_conversion_sweep [Device=1 In=I8 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.55% > 0.50%) +Pass: Cold: 1.457198ms GPU, 1.461889ms CPU, 14.72s total GPU, 10100x +Pass: Batch: 1.455933ms GPU, 14.71s total GPU, 10101x +Run: copy_type_conversion_sweep [Device=1 In=I16 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). 
+Run: copy_type_conversion_sweep [Device=1 In=I16 Out=I16] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=1 In=I16 Out=I32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%) +Pass: Cold: 0.455669ms GPU, 0.460392ms CPU, 13.86s total GPU, 30414x +Pass: Batch: 0.455415ms GPU, 13.85s total GPU, 30415x +Run: copy_type_conversion_sweep [Device=1 In=I16 Out=F32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.78% > 0.50%) +Pass: Cold: 0.454242ms GPU, 0.458932ms CPU, 13.86s total GPU, 30506x +Pass: Batch: 0.453806ms GPU, 13.84s total GPU, 30507x +Run: copy_type_conversion_sweep [Device=1 In=I16 Out=I64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.56% > 0.50%) +Pass: Cold: 0.748931ms GPU, 0.753645ms CPU, 14.38s total GPU, 19198x +Pass: Batch: 0.748024ms GPU, 14.36s total GPU, 19199x +Run: copy_type_conversion_sweep [Device=1 In=I16 Out=F64] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.54% > 0.50%) +Pass: Cold: 0.747517ms GPU, 0.752207ms CPU, 14.38s total GPU, 19239x +Pass: Batch: 0.746459ms GPU, 14.36s total GPU, 19240x +Run: copy_type_conversion_sweep [Device=1 In=I32 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=I32 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=I32 Out=I32] +Skip: Not a conversion: InputType == OutputType. 
+Run: copy_type_conversion_sweep [Device=1 In=I32 Out=F32] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%) +Pass: Cold: 0.273880ms GPU, 0.278585ms CPU, 12.87s total GPU, 47007x +Pass: Batch: 0.273539ms GPU, 12.86s total GPU, 47008x +Run: copy_type_conversion_sweep [Device=1 In=I32 Out=I64] +Pass: Cold: 0.418105ms GPU, 0.422857ms CPU, 0.50s total GPU, 1196x +Pass: Batch: 0.416192ms GPU, 0.52s total GPU, 1254x +Run: copy_type_conversion_sweep [Device=1 In=I32 Out=F64] +Pass: Cold: 0.418703ms GPU, 0.423383ms CPU, 0.50s total GPU, 1195x +Pass: Batch: 0.416603ms GPU, 0.52s total GPU, 1252x +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=I32] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.25% > 0.50%) +Pass: Cold: 0.276847ms GPU, 0.281581ms CPU, 12.89s total GPU, 46545x +Pass: Batch: 0.276479ms GPU, 12.87s total GPU, 46546x +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=F32] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=I64] +Pass: Cold: 0.418391ms GPU, 0.423070ms CPU, 0.50s total GPU, 1196x +Pass: Batch: 0.416373ms GPU, 0.52s total GPU, 1257x +Run: copy_type_conversion_sweep [Device=1 In=F32 Out=F64] +Pass: Cold: 0.418690ms GPU, 0.423378ms CPU, 0.50s total GPU, 1195x +Pass: Batch: 0.416660ms GPU, 0.53s total GPU, 1265x +Run: copy_type_conversion_sweep [Device=1 In=I64 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=I64 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). 
+Run: copy_type_conversion_sweep [Device=1 In=I64 Out=I32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=I64 Out=F32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=I64 Out=I64] +Skip: Not a conversion: InputType == OutputType. +Run: copy_type_conversion_sweep [Device=1 In=I64 Out=F64] +Pass: Cold: 0.261885ms GPU, 0.266569ms CPU, 0.50s total GPU, 1910x +Pass: Batch: 0.260037ms GPU, 0.52s total GPU, 2011x +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=I8] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=I16] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=I32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=F32] +Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType). +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=I64] +Pass: Cold: 0.261586ms GPU, 0.266286ms CPU, 0.50s total GPU, 1912x +Pass: Batch: 0.259790ms GPU, 0.52s total GPU, 2016x +Run: copy_type_conversion_sweep [Device=1 In=F64 Out=F64] +Skip: Not a conversion: InputType == OutputType. 
+``` + +# Benchmark Results + +## simple + +### [0] Quadro GV100 + +| Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|---------|----------|-------|----------|-------|-----------|-------| +| 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 524x | + +### [1] Quadro GP100 + +| Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|---------|----------|-------|----------|-------|-----------|-------| +| 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## single_float64_axis + +### [0] Quadro GV100 + +| Duration | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|------------|-------|------------|--------|------------|---------| +| 0 | 147957x | 10.618 us | 3.25% | 4.424 us | 10.78% | 2.043 us | 244766x | +| 0.0001 | 4831x | 110.048 us | 0.42% | 103.515 us | 0.48% | 101.376 us | 5088x | +| 0.0002 | 2453x | 210.369 us | 0.22% | 203.903 us | 0.25% | 201.729 us | 2582x | +| 0.0003 | 1648x | 309.866 us | 0.15% | 303.412 us | 0.17% | 301.164 us | 1736x | +| 0.0004 | 1239x | 410.148 us | 0.12% | 403.673 us | 0.14% | 401.410 us | 1304x | +| 0.0005 | 992x | 510.529 us | 0.09% | 504.089 us | 0.11% | 501.762 us | 1042x | +| 0.0006 | 829x | 609.862 us | 0.08% | 603.471 us | 0.10% | 601.104 us | 872x | +| 0.0007 | 711x | 710.294 us | 0.07% | 703.744 us | 0.08% | 701.443 us | 748x | +| 0.0008 | 622x | 810.565 us | 0.06% | 804.187 us | 0.07% | 801.795 us | 653x | +| 0.0009 | 554x | 909.873 us | 0.05% | 903.433 us | 0.06% | 901.125 us | 582x | +| 0.001 | 499x | 1.010 ms | 0.04% | 1.004 ms | 0.05% | 1.001 ms | 523x | + +### [1] Quadro GP100 + +| Duration | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|------------|-------|------------|-------|------------|---------| +| 0 | 152839x | 7.705 us | 5.42% | 3.016 us | 4.13% | 1.343 us | 372166x | +| 0.0001 | 4879x | 107.156 us | 0.41% | 102.481 us | 0.31% | 101.376 us | 5107x | +| 0.0002 | 2466x | 
207.544 us | 0.19% | 202.833 us | 0.15% | 201.728 us | 2586x | +| 0.0003 | 1655x | 306.880 us | 0.13% | 302.191 us | 0.11% | 301.057 us | 1736x | +| 0.0004 | 1243x | 407.214 us | 0.11% | 402.508 us | 0.08% | 401.409 us | 1305x | +| 0.0005 | 995x | 507.562 us | 0.08% | 502.864 us | 0.06% | 501.761 us | 1045x | +| 0.0006 | 831x | 606.954 us | 0.07% | 602.223 us | 0.05% | 601.089 us | 873x | +| 0.0007 | 712x | 707.255 us | 0.06% | 702.559 us | 0.04% | 701.442 us | 748x | +| 0.0008 | 623x | 807.636 us | 0.05% | 802.910 us | 0.04% | 801.794 us | 655x | +| 0.0009 | 555x | 906.935 us | 0.05% | 902.248 us | 0.03% | 901.123 us | 582x | +| 0.001 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## copy_sweep_grid_shape + +### [0] Quadro GV100 + +| BlockSize | (BlockSize) | NumBlocks | (NumBlocks) | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----------|-------------|-----------|-------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------| +| 2^6 | 64 | 2^6 | 64 | 66x | 7.622 ms | 0.14% | 7.616 ms | 0.14% | 8.812G | 70.495 GB/s | 8.10% | 7.615 ms | 69x | +| 2^8 | 256 | 2^6 | 64 | 206x | 2.442 ms | 0.46% | 2.436 ms | 0.46% | 27.547G | 220.379 GB/s | 25.32% | 2.434 ms | 215x | +| 2^10 | 1024 | 2^6 | 64 | 13161x | 1.112 ms | 1.29% | 1.106 ms | 1.30% | 60.681G | 485.445 GB/s | 55.77% | 1.103 ms | 13162x | +| 2^6 | 64 | 2^8 | 256 | 375x | 2.450 ms | 0.50% | 2.444 ms | 0.50% | 27.457G | 219.652 GB/s | 25.24% | 2.444 ms | 376x | +| 2^8 | 256 | 2^8 | 256 | 13509x | 1.083 ms | 0.96% | 1.077 ms | 0.97% | 62.332G | 498.653 GB/s | 57.29% | 1.076 ms | 13510x | +| 2^10 | 1024 | 2^8 | 256 | 15105x | 964.751 us | 0.51% | 958.478 us | 0.51% | 70.016G | 560.129 GB/s | 64.35% | 957.249 us | 15106x | +| 2^6 | 64 | 2^10 | 1024 | 13582x | 1.077 ms | 0.74% | 1.071 ms | 0.74% | 62.682G | 501.460 GB/s | 57.61% | 1.071 ms | 13583x | +| 2^8 | 256 | 2^10 | 1024 | 
1782x | 962.899 us | 0.50% | 956.568 us | 0.50% | 70.156G | 561.247 GB/s | 64.48% | 954.599 us | 1783x | +| 2^10 | 1024 | 2^10 | 1024 | 14579x | 1.000 ms | 1.93% | 994.218 us | 1.94% | 67.499G | 539.993 GB/s | 62.04% | 992.819 us | 14580x | + +### [1] Quadro GP100 + +| BlockSize | (BlockSize) | NumBlocks | (NumBlocks) | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----------|-------------|-----------|-------------|---------|----------|-------|----------|-------|---------|--------------|--------|-----------|--------| +| 2^6 | 64 | 2^6 | 64 | 2236x | 6.689 ms | 1.09% | 6.684 ms | 1.10% | 10.040G | 80.319 GB/s | 10.97% | 6.675 ms | 2237x | +| 2^8 | 256 | 2^6 | 64 | 218x | 2.301 ms | 0.29% | 2.296 ms | 0.29% | 29.224G | 233.794 GB/s | 31.93% | 2.298 ms | 228x | +| 2^10 | 1024 | 2^6 | 64 | 426x | 1.179 ms | 0.39% | 1.174 ms | 0.39% | 57.144G | 457.155 GB/s | 62.44% | 1.172 ms | 449x | +| 2^6 | 64 | 2^8 | 256 | 226x | 2.218 ms | 0.16% | 2.214 ms | 0.16% | 30.316G | 242.531 GB/s | 33.13% | 2.213 ms | 237x | +| 2^8 | 256 | 2^8 | 256 | 12933x | 1.135 ms | 0.67% | 1.131 ms | 0.67% | 59.361G | 474.891 GB/s | 64.86% | 1.130 ms | 12934x | +| 2^10 | 1024 | 2^8 | 256 | 447x | 1.124 ms | 0.22% | 1.119 ms | 0.22% | 59.975G | 479.797 GB/s | 65.53% | 1.117 ms | 468x | +| 2^6 | 64 | 2^10 | 1024 | 448x | 1.122 ms | 0.30% | 1.117 ms | 0.30% | 60.084G | 480.669 GB/s | 65.65% | 1.115 ms | 470x | +| 2^8 | 256 | 2^10 | 1024 | 448x | 1.122 ms | 0.28% | 1.118 ms | 0.28% | 60.042G | 480.335 GB/s | 65.61% | 1.116 ms | 471x | +| 2^10 | 1024 | 2^10 | 1024 | 474x | 1.060 ms | 0.15% | 1.056 ms | 0.15% | 63.568G | 508.542 GB/s | 69.46% | 1.054 ms | 498x | + +## copy_type_sweep + +### [0] Quadro GV100 + +| T | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----|---------|------------|-------|------------|-------|----------|--------------|--------|------------|--------| +| U8 | 197x | 
2.550 ms | 0.30% | 2.544 ms | 0.30% | 105.536G | 211.072 GB/s | 24.25% | 2.539 ms | 206x | +| U16 | 314x | 1.602 ms | 0.41% | 1.596 ms | 0.41% | 84.116G | 336.465 GB/s | 38.66% | 1.592 ms | 331x | +| U32 | 13509x | 1.083 ms | 0.96% | 1.077 ms | 0.97% | 62.333G | 498.668 GB/s | 57.29% | 1.075 ms | 13510x | +| U64 | 15542x | 936.680 us | 0.60% | 930.416 us | 0.61% | 36.064G | 577.023 GB/s | 66.29% | 929.189 us | 15543x | +| F32 | 13508x | 1.083 ms | 0.97% | 1.077 ms | 0.97% | 62.323G | 498.586 GB/s | 57.28% | 1.075 ms | 13509x | +| F64 | 15546x | 936.683 us | 0.59% | 930.412 us | 0.60% | 36.064G | 577.025 GB/s | 66.29% | 929.182 us | 15547x | + +### [1] Quadro GP100 + +| T | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----|---------|----------|-------|----------|-------|---------|--------------|--------|-----------|--------| +| U8 | 5497x | 2.708 ms | 0.64% | 2.703 ms | 0.64% | 99.307G | 198.614 GB/s | 27.13% | 2.700 ms | 5498x | +| U16 | 330x | 1.520 ms | 0.44% | 1.515 ms | 0.44% | 88.573G | 354.292 GB/s | 48.39% | 1.514 ms | 348x | +| U32 | 12935x | 1.135 ms | 0.66% | 1.131 ms | 0.66% | 59.356G | 474.846 GB/s | 64.86% | 1.130 ms | 12936x | +| U64 | 478x | 1.052 ms | 0.27% | 1.048 ms | 0.27% | 32.032G | 512.520 GB/s | 70.00% | 1.045 ms | 500x | +| F32 | 12933x | 1.135 ms | 0.67% | 1.131 ms | 0.67% | 59.355G | 474.842 GB/s | 64.85% | 1.130 ms | 12934x | +| F64 | 477x | 1.053 ms | 0.28% | 1.048 ms | 0.28% | 32.005G | 512.078 GB/s | 69.94% | 1.046 ms | 497x | + +## copy_type_conversion_sweep + +### [0] Quadro GV100 + +| In | Out | Items | InSize | OutSize | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----|-----|----------|------------|-------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------| +| I8 | I16 | 67108864 | 64.000 MiB | 128.000 MiB | 712x | 709.177 us | 0.29% | 702.933 us | 
0.30% | 95.470G | 286.410 GB/s | 32.91% | 699.714 us | 748x | +| I8 | I32 | 67108864 | 64.000 MiB | 256.000 MiB | 622x | 810.934 us | 0.31% | 804.698 us | 0.31% | 83.396G | 416.982 GB/s | 47.91% | 802.089 us | 654x | +| I8 | F32 | 67108864 | 64.000 MiB | 256.000 MiB | 614x | 821.028 us | 0.32% | 814.768 us | 0.33% | 82.366G | 411.828 GB/s | 47.31% | 812.088 us | 645x | +| I8 | I64 | 67108864 | 64.000 MiB | 512.000 MiB | 12047x | 1.218 ms | 1.04% | 1.212 ms | 1.04% | 55.369G | 498.319 GB/s | 57.25% | 1.211 ms | 12048x | +| I8 | F64 | 67108864 | 64.000 MiB | 512.000 MiB | 12345x | 1.188 ms | 0.74% | 1.182 ms | 0.74% | 56.787G | 511.086 GB/s | 58.72% | 1.180 ms | 12346x | +| I16 | I32 | 33554432 | 64.000 MiB | 128.000 MiB | 30679x | 453.377 us | 0.56% | 447.104 us | 0.57% | 75.048G | 450.290 GB/s | 51.73% | 446.394 us | 30680x | +| I16 | F32 | 33554432 | 64.000 MiB | 128.000 MiB | 1111x | 456.445 us | 0.46% | 450.117 us | 0.47% | 74.546G | 447.276 GB/s | 51.39% | 447.497 us | 1162x | +| I16 | I64 | 33554432 | 64.000 MiB | 256.000 MiB | 21586x | 663.639 us | 0.68% | 657.381 us | 0.69% | 51.043G | 510.426 GB/s | 58.64% | 656.117 us | 21587x | +| I16 | F64 | 33554432 | 64.000 MiB | 256.000 MiB | 21638x | 661.733 us | 0.69% | 655.457 us | 0.70% | 51.192G | 511.925 GB/s | 58.81% | 653.902 us | 21639x | +| I32 | F32 | 16777216 | 64.000 MiB | 64.000 MiB | 47462x | 272.530 us | 1.23% | 266.290 us | 1.26% | 63.003G | 504.028 GB/s | 57.91% | 264.891 us | 47463x | +| I32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB | 35529x | 384.345 us | 0.71% | 378.098 us | 0.72% | 44.373G | 532.471 GB/s | 61.18% | 377.321 us | 35530x | +| I32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB | 35498x | 384.783 us | 0.79% | 378.571 us | 0.81% | 44.317G | 531.807 GB/s | 61.10% | 377.827 us | 35499x | +| F32 | I32 | 16777216 | 64.000 MiB | 64.000 MiB | 47607x | 271.689 us | 1.32% | 265.440 us | 1.35% | 63.205G | 505.642 GB/s | 58.09% | 263.872 us | 47608x | +| F32 | I64 | 16777216 | 64.000 MiB | 
128.000 MiB | 35499x | 384.786 us | 0.78% | 378.543 us | 0.79% | 44.320G | 531.846 GB/s | 61.10% | 377.921 us | 35500x | +| F32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB | 35509x | 384.556 us | 0.82% | 378.288 us | 0.84% | 44.350G | 532.204 GB/s | 61.14% | 377.510 us | 35510x | +| I64 | F64 | 8388608 | 64.000 MiB | 64.000 MiB | 52100x | 242.456 us | 0.85% | 236.185 us | 0.88% | 35.517G | 568.273 GB/s | 65.29% | 235.095 us | 52101x | +| F64 | I64 | 8388608 | 64.000 MiB | 64.000 MiB | 51780x | 244.316 us | 0.98% | 238.030 us | 1.01% | 35.242G | 563.869 GB/s | 64.78% | 236.852 us | 51781x | + +### [1] Quadro GP100 + +| In | Out | Items | InSize | OutSize | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-----|-----|----------|------------|-------------|---------|------------|-------|------------|-------|----------|--------------|--------|------------|--------| +| I8 | I16 | 67108864 | 64.000 MiB | 128.000 MiB | 21576x | 666.484 us | 0.80% | 661.753 us | 0.80% | 101.411G | 304.232 GB/s | 41.55% | 660.957 us | 21577x | +| I8 | I32 | 67108864 | 64.000 MiB | 256.000 MiB | 16864x | 862.918 us | 0.79% | 858.200 us | 0.79% | 78.197G | 390.986 GB/s | 53.40% | 857.568 us | 16865x | +| I8 | F32 | 67108864 | 64.000 MiB | 256.000 MiB | 16866x | 862.614 us | 0.80% | 857.883 us | 0.80% | 78.226G | 391.131 GB/s | 53.42% | 857.087 us | 16867x | +| I8 | I64 | 67108864 | 64.000 MiB | 512.000 MiB | 10113x | 1.460 ms | 0.55% | 1.455 ms | 0.55% | 46.113G | 415.013 GB/s | 56.68% | 1.454 ms | 10114x | +| I8 | F64 | 67108864 | 64.000 MiB | 512.000 MiB | 10100x | 1.462 ms | 0.54% | 1.457 ms | 0.55% | 46.053G | 414.480 GB/s | 56.61% | 1.456 ms | 10101x | +| I16 | I32 | 33554432 | 64.000 MiB | 128.000 MiB | 30414x | 460.392 us | 0.78% | 455.669 us | 0.79% | 73.638G | 441.826 GB/s | 60.35% | 455.415 us | 30415x | +| I16 | F32 | 33554432 | 64.000 MiB | 128.000 MiB | 30506x | 458.932 us | 0.78% | 454.242 us | 0.78% | 73.869G | 443.215 GB/s | 
60.54% | 453.806 us | 30507x | +| I16 | I64 | 33554432 | 64.000 MiB | 256.000 MiB | 19198x | 753.645 us | 0.56% | 748.931 us | 0.56% | 44.803G | 448.031 GB/s | 61.19% | 748.024 us | 19199x | +| I16 | F64 | 33554432 | 64.000 MiB | 256.000 MiB | 19239x | 752.207 us | 0.54% | 747.517 us | 0.54% | 44.888G | 448.878 GB/s | 61.31% | 746.459 us | 19240x | +| I32 | F32 | 16777216 | 64.000 MiB | 64.000 MiB | 47007x | 278.585 us | 0.78% | 273.880 us | 0.79% | 61.258G | 490.060 GB/s | 66.93% | 273.539 us | 47008x | +| I32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB | 1196x | 422.857 us | 0.44% | 418.105 us | 0.44% | 40.127G | 481.521 GB/s | 65.77% | 416.192 us | 1254x | +| I32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB | 1195x | 423.383 us | 0.47% | 418.703 us | 0.47% | 40.070G | 480.834 GB/s | 65.67% | 416.603 us | 1252x | +| F32 | I32 | 16777216 | 64.000 MiB | 64.000 MiB | 46545x | 281.581 us | 1.23% | 276.847 us | 1.25% | 60.601G | 484.808 GB/s | 66.22% | 276.479 us | 46546x | +| F32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB | 1196x | 423.070 us | 0.46% | 418.391 us | 0.46% | 40.099G | 481.193 GB/s | 65.72% | 416.373 us | 1257x | +| F32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB | 1195x | 423.378 us | 0.47% | 418.690 us | 0.47% | 40.071G | 480.849 GB/s | 65.68% | 416.660 us | 1265x | +| I64 | F64 | 8388608 | 64.000 MiB | 64.000 MiB | 1910x | 266.569 us | 0.42% | 261.885 us | 0.42% | 32.032G | 512.506 GB/s | 70.00% | 260.037 us | 2011x | +| F64 | I64 | 8388608 | 64.000 MiB | 64.000 MiB | 1912x | 266.286 us | 0.42% | 261.586 us | 0.41% | 32.068G | 513.092 GB/s | 70.08% | 259.790 us | 2016x | diff --git a/examples/outputs/nvbench.example.enums.csv b/examples/outputs/nvbench.example.enums.csv new file mode 100644 index 0000000..e644155 --- /dev/null +++ b/examples/outputs/nvbench.example.enums.csv @@ -0,0 +1,27 @@ +Benchmark,Device,Device Name,MyEnum,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,SomeInts +runtime_enum_sweep_string,0,Quadro 
GV100,A,No,499,0.0010101454729458924,0.00046483104049603716,0.001003841868144471,0.0005749097846815492,0.001001477914376195,523, +runtime_enum_sweep_string,0,Quadro GV100,B,No,499,0.001010159408817635,0.0004613137827456428,0.0010038492447866406,0.0005555267067795092,0.001001477914376195,523, +runtime_enum_sweep_string,0,Quadro GV100,C,No,499,0.0010101243687374746,0.0005092828188229102,0.001003840521246726,0.0005589757153765308,0.0010014759304418617,523, +runtime_enum_sweep_string,1,Quadro GP100,A,No,499,0.0010076531543086172,0.0004980799779298182,0.0010027744387815845,0.0003286485696207647,0.0010014733467393249,524, +runtime_enum_sweep_string,1,Quadro GP100,B,No,499,0.001007305567134269,0.0004149742922173277,0.0010026098881551395,0.0003090849629556725,0.0010014743950530773,524, +runtime_enum_sweep_string,1,Quadro GP100,C,No,499,0.0010073099939879758,0.00041034663323030794,0.001002610978000388,0.0003063685132670013,0.0010014735796979365,524, +runtime_enum_sweep_int64,0,Quadro GV100,0,No,499,0.0010101685511022043,0.00048173554195341087,0.001003846292266381,0.0005760736687258556,0.001001475909284053,524, +runtime_enum_sweep_int64,0,Quadro GV100,1,No,499,0.0010101069098196389,0.000452307701913514,0.0010036457628907521,0.0004970727279364392,0.0010014759304418617,523, +runtime_enum_sweep_int64,0,Quadro GV100,2,No,499,0.0010101263466933872,0.0004551387873866542,0.001003668336447824,0.0004936151206687181,0.0010014759304418617,523, +runtime_enum_sweep_int64,1,Quadro GP100,0,No,499,0.0010076479038076157,0.0005063884103258074,0.0010027646934818884,0.0003189744202423173,0.0010014735796979365,524, +runtime_enum_sweep_int64,1,Quadro GP100,1,No,499,0.0010073362545090188,0.0006783144138303011,0.001002585007575805,0.00032119540844936307,0.0010014733467393249,524, +runtime_enum_sweep_int64,1,Quadro GP100,2,No,499,0.0010073002645290582,0.0004221934097940968,0.0010025900736122663,0.00030148599118709333,0.0010014748609703007,524, +compile_time_enum_sweep,0,Quadro 
GV100,A,No,499,0.001010085394789578,0.0004655817990209206,0.001003753755278955,0.0005493729335684191,0.0010014759304418617,523, +compile_time_enum_sweep,0,Quadro GV100,B,No,499,0.001010053595190381,0.0005071877844696576,0.0010036782112293535,0.0005775279883320534,0.001001473929135854,524, +compile_time_enum_sweep,0,Quadro GV100,C,No,499,0.0010101191442885773,0.000491671020220968,0.001003674685835594,0.0005387103704343772,0.001001475909284053,524, +compile_time_enum_sweep,1,Quadro GP100,A,No,499,0.0010072828116232475,0.0004397728983271746,0.001002574038171099,0.0003125584355538476,0.0010014731137807133,524, +compile_time_enum_sweep,1,Quadro GP100,B,No,499,0.0010073154749498997,0.00040217119470421287,0.0010025884063544867,0.0002963185452948694,0.0010014750939289121,524, +compile_time_enum_sweep,1,Quadro GP100,C,No,499,0.0010073215450901796,0.00042671482068318837,0.0010026036672936165,0.000296913783133073,0.0010014753268875237,524, +compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101242705410818,0.00046763787302286317,0.0010036719920401053,0.0005214982620209435,0.001001477889432252,524,0 +compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101197975951911,0.00047310755052367136,0.0010037048890260939,0.0005711753095722302,0.0010014759304418617,523,16 +compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010100482024048096,0.00048230228130997395,0.0010036862909435461,0.0005703633321237566,0.001001475909284053,524,4096 +compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101047835671339,0.0004779495034261126,0.0010036583329011495,0.0004905372193961659,0.0010014759304418617,523,-12 +compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010073524008016035,0.00045954094220701026,0.001002591226765053,0.0003132035131857924,0.0010014731137807133,524,0 +compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010072802204408825,0.00041173035101990657,0.001002568971417949,0.00030006995446635803,0.0010014742785737715,524,16 +compile_time_int_sweep,1,Quadro 
GP100,,No,499,0.0010072856112224454,0.00041573006780551387,0.0010025766641200192,0.0003017176216493314,0.0010014750939289121,524,4096 +compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010073294088176355,0.00042082782124641986,0.001002588534402943,0.0002955692521037092,0.0010014733467393249,524,-12 diff --git a/examples/outputs/nvbench.example.enums.json b/examples/outputs/nvbench.example.enums.json new file mode 100644 index 0000000..e4430f1 --- /dev/null +++ b/examples/outputs/nvbench.example.enums.json @@ -0,0 +1,3949 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "runtime_enum_sweep_string", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "MyEnum": { + "type": "string", + "flags": "", + 
"values": [ + { + "input_string": "A", + "description": "", + "value": "A" + }, + { + "input_string": "B", + "description": "", + "value": "B" + }, + { + "input_string": "C", + "description": "", + "value": "C" + } + ] + } + }, + "states": { + "Device=0 MyEnum=A": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "A" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101454729458924" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00046483104049603716" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001003841868144471" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005749097846815492" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001477914376195" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=B": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "B" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001010159408817635" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004613137827456428" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010038492447866406" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005555267067795092" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001477914376195" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=C": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "C" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101243687374746" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0005092828188229102" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003840521246726" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005589757153765308" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=A": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "A" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010076531543086172" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004980799779298182" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010027744387815845" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003286485696207647" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014733467393249" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=B": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "B" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001007305567134269" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004149742922173277" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010026098881551395" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003090849629556725" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014743950530773" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=C": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "C" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073099939879758" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00041034663323030794" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001002610978000388" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003063685132670013" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014735796979365" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 1, + "name": "runtime_enum_sweep_int64", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "MyEnum": { + "type": "int64", + "flags": "", + "values": [ + { + "input_string": "0", + "description": "", + "value": 0 + }, + { + "input_string": "1", + "description": "", + "value": 1 + }, + { + "input_string": "2", + "description": "", + "value": 2 + } + ] + } + }, + "states": { + "Device=0 MyEnum=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101685511022043" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00048173554195341087" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003846292266381" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005760736687258556" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=1": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101069098196389" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.000452307701913514" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036457628907521" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004970727279364392" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=2": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "2" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101263466933872" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004551387873866542" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003668336447824" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004936151206687181" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=0": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010076479038076157" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0005063884103258074" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010027646934818884" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003189744202423173" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014735796979365" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "1" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073362545090188" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006783144138303011" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001002585007575805" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00032119540844936307" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014733467393249" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=2": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "int64", + "value": "2" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073002645290582" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004221934097940968" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025900736122663" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00030148599118709333" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014748609703007" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 2, + "name": "compile_time_enum_sweep", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "MyEnum": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "A", + "description": "MyEnum::ValueA", + "is_active": true + }, + { + "input_string": "B", + "description": "MyEnum::ValueB", + "is_active": true + }, + { + "input_string": "C", + "description": "MyEnum::ValueC", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 MyEnum=A": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "A" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.001010085394789578" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004655817990209206" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003753755278955" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005493729335684191" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=B": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "B" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001010053595190381" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005071877844696576" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036782112293535" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005775279883320534" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001473929135854" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 MyEnum=C": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "C" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101191442885773" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.000491671020220968" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003674685835594" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005387103704343772" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=A": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "A" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072828116232475" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0004397728983271746" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001002574038171099" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003125584355538476" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014731137807133" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=B": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "B" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073154749498997" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00040217119470421287" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010025884063544867" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0002963185452948694" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014750939289121" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 MyEnum=C": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "MyEnum": { + "type": "string", + "value": "C" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073215450901796" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00042671482068318837" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010026036672936165" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.000296913783133073" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014753268875237" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 3, + "name": "compile_time_int_sweep", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "SomeInts": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "0", + "description": "nvbench::enum_type<0, int>", + "is_active": true + }, + { + "input_string": "16", + "description": "nvbench::enum_type<16, int>", + "is_active": true + }, + { + "input_string": "4096", + "description": "nvbench::enum_type<4096, int>", + "is_active": true + }, + { + "input_string": "-12", + "description": "nvbench::enum_type<-12, int>", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 SomeInts=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101242705410818" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00046763787302286317" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036719920401053" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005214982620209435" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001477889432252" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 SomeInts=16": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101197975951911" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00047310755052367136" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010037048890260939" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005711753095722302" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 SomeInts=4096": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "4096" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010100482024048096" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00048230228130997395" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036862909435461" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005703633321237566" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 SomeInts=-12": { + "device": 0, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "-12" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101047835671339" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004779495034261126" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036583329011495" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004905372193961659" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=1 SomeInts=0": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "0" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073524008016035" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00045954094220701026" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001002591226765053" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003132035131857924" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014731137807133" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 SomeInts=16": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072802204408825" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00041173035101990657" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001002568971417949" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00030006995446635803" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014742785737715" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 SomeInts=4096": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "4096" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072856112224454" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00041573006780551387" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025766641200192" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003017176216493314" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014750939289121" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 SomeInts=-12": { + "device": 1, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "SomeInts": { + "type": "string", + "value": "-12" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073294088176355" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00042082782124641986" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001002588534402943" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0002955692521037092" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014733467393249" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.enums.list.md b/examples/outputs/nvbench.example.enums.list.md new file mode 100644 index 0000000..9cb3a73 --- /dev/null +++ b/examples/outputs/nvbench.example.enums.list.md @@ -0,0 +1,67 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 30117 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14939 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `runtime_enum_sweep_string` (3 configurations) + +### Axes + +* `MyEnum` : string + * `A` + * `B` + * `C` + +## [1] `runtime_enum_sweep_int64` (3 configurations) + +### Axes + +* `MyEnum` : int64 + * `0` + * `1` + * `2` + +## [2] `compile_time_enum_sweep` (3 configurations) + +### Axes + +* `MyEnum` : type + * `A` (MyEnum::ValueA) + * `B` (MyEnum::ValueB) + * `C` (MyEnum::ValueC) + +## [3] `compile_time_int_sweep` (4 configurations) + +### Axes + +* `SomeInts` : type + * `0` (nvbench::enum_type<0, int>) + * `16` (nvbench::enum_type<16, int>) + * `4096` (nvbench::enum_type<4096, int>) + * `-12` (nvbench::enum_type<-12, int>) + diff --git a/examples/outputs/nvbench.example.enums.md 
b/examples/outputs/nvbench.example.enums.md new file mode 100644 index 0000000..583f140 --- /dev/null +++ b/examples/outputs/nvbench.example.enums.md @@ -0,0 +1,186 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: runtime_enum_sweep_string [Device=0 MyEnum=A] +Pass: Cold: 1.003842ms GPU, 1.010145ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x +Run: runtime_enum_sweep_string [Device=0 MyEnum=B] +Pass: Cold: 1.003849ms GPU, 1.010159ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x +Run: runtime_enum_sweep_string [Device=0 MyEnum=C] +Pass: Cold: 1.003841ms GPU, 1.010124ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: runtime_enum_sweep_string [Device=1 MyEnum=A] +Pass: Cold: 1.002774ms GPU, 1.007653ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_string [Device=1 MyEnum=B] +Pass: Cold: 1.002610ms GPU, 1.007306ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_string [Device=1 MyEnum=C] 
+Pass: Cold: 1.002611ms GPU, 1.007310ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_int64 [Device=0 MyEnum=0] +Pass: Cold: 1.003846ms GPU, 1.010169ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_int64 [Device=0 MyEnum=1] +Pass: Cold: 1.003646ms GPU, 1.010107ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: runtime_enum_sweep_int64 [Device=0 MyEnum=2] +Pass: Cold: 1.003668ms GPU, 1.010126ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: runtime_enum_sweep_int64 [Device=1 MyEnum=0] +Pass: Cold: 1.002765ms GPU, 1.007648ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_int64 [Device=1 MyEnum=1] +Pass: Cold: 1.002585ms GPU, 1.007336ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: runtime_enum_sweep_int64 [Device=1 MyEnum=2] +Pass: Cold: 1.002590ms GPU, 1.007300ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: compile_time_enum_sweep [Device=0 MyEnum=A] +Pass: Cold: 1.003754ms GPU, 1.010085ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: compile_time_enum_sweep [Device=0 MyEnum=B] +Pass: Cold: 1.003678ms GPU, 1.010054ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: compile_time_enum_sweep [Device=0 MyEnum=C] +Pass: Cold: 1.003675ms GPU, 1.010119ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: compile_time_enum_sweep [Device=1 MyEnum=A] +Pass: Cold: 1.002574ms GPU, 1.007283ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: compile_time_enum_sweep [Device=1 MyEnum=B] +Pass: Cold: 1.002588ms GPU, 1.007315ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: 
compile_time_enum_sweep [Device=1 MyEnum=C] +Pass: Cold: 1.002604ms GPU, 1.007322ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=0 SomeInts=0] +Pass: Cold: 1.003672ms GPU, 1.010124ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=0 SomeInts=16] +Pass: Cold: 1.003705ms GPU, 1.010120ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: compile_time_int_sweep [Device=0 SomeInts=4096] +Pass: Cold: 1.003686ms GPU, 1.010048ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=0 SomeInts=-12] +Pass: Cold: 1.003658ms GPU, 1.010105ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: compile_time_int_sweep [Device=1 SomeInts=0] +Pass: Cold: 1.002591ms GPU, 1.007352ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=1 SomeInts=16] +Pass: Cold: 1.002569ms GPU, 1.007280ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=1 SomeInts=4096] +Pass: Cold: 1.002577ms GPU, 1.007286ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: compile_time_int_sweep [Device=1 SomeInts=-12] +Pass: Cold: 1.002589ms GPU, 1.007329ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +``` + +# Benchmark Results + +## runtime_enum_sweep_string + +### [0] Quadro GV100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| A | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 523x | +| B | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 523x | +| C | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 523x | + +### 
[1] Quadro GP100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| A | 499x | 1.008 ms | 0.05% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| B | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| C | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## runtime_enum_sweep_int64 + +### [0] Quadro GV100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| 0 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 524x | +| 1 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| 2 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | + +### [1] Quadro GP100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| 0 | 499x | 1.008 ms | 0.05% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| 1 | 499x | 1.007 ms | 0.07% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| 2 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## compile_time_enum_sweep + +### [0] Quadro GV100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| A | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| B | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 524x | +| C | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | + +### [1] Quadro GP100 + +| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|--------|---------|----------|-------|----------|-------|-----------|-------| +| A | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| B | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| C | 499x | 1.007 ms | 0.04% | 
1.003 ms | 0.03% | 1.001 ms | 524x | + +## compile_time_int_sweep + +### [0] Quadro GV100 + +| SomeInts | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|----------|-------|----------|-------|-----------|-------| +| 0 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| 16 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 523x | +| 4096 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% | 1.001 ms | 524x | +| -12 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | + +### [1] Quadro GP100 + +| SomeInts | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|----------|-------|----------|-------|-----------|-------| +| 0 | 499x | 1.007 ms | 0.05% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| 16 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| 4096 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| -12 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | diff --git a/examples/outputs/nvbench.example.exec_tag_sync.csv b/examples/outputs/nvbench.example.exec_tag_sync.csv new file mode 100644 index 0000000..d8eb773 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_sync.csv @@ -0,0 +1,3 @@ +Benchmark,Device,Device Name,Skipped,Items,Size (bytes),Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak +sequence_bench,0,Quadro GV100,No,16777216,67108864,88096,0.00011210838060751815,0.0043948813906645855,0.00010738264021815305,0.005226831698093829,156237693224.12143,624950772896.4857,0.7180041048902639 +sequence_bench,1,Quadro GP100,No,16777216,67108864,4236,0.0001220395009442869,0.0031913153957058224,0.00011805303109505499,0.0030945635166685077,142115927430.03076,568463709720.123,0.7764200580749058 diff --git a/examples/outputs/nvbench.example.exec_tag_sync.json b/examples/outputs/nvbench.example.exec_tag_sync.json new file mode 100644 index 0000000..4d4aa0a --- 
/dev/null +++ b/examples/outputs/nvbench.example.exec_tag_sync.json @@ -0,0 +1,426 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "sequence_bench", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": null, + "states": { + "Device=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": 
"Size" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "88096" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00011210838060751815" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0043948813906645855" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00010738264021815305" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.005226831698093829" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "156237693224.12143" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "624950772896.4857" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.7180041048902639" + } + } + }, + "is_skipped": false + }, + "Device=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Element count: Items": { + "short_name": { + "type": "string", + "value": "Items" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Output Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "Size" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "4236" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0001220395009442869" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0031913153957058224" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.00011805303109505499" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0030945635166685077" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "142115927430.03076" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "568463709720.123" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.7764200580749058" + } + } + }, + "is_skipped": false + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.exec_tag_sync.list.md b/examples/outputs/nvbench.example.exec_tag_sync.list.md new file mode 100644 index 0000000..29a2bc7 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_sync.list.md @@ -0,0 +1,32 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 30269 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14939 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `sequence_bench` (1 configurations) + diff --git a/examples/outputs/nvbench.example.exec_tag_sync.md b/examples/outputs/nvbench.example.exec_tag_sync.md new file mode 100644 index 0000000..4b2059c --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_sync.md @@ -0,0 +1,53 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* 
Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: sequence_bench [Device=0] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.52% > 0.50%) +Pass: Cold: 0.107383ms GPU, 0.112108ms CPU, 9.46s total GPU, 88096x +Run: sequence_bench [Device=1] +Pass: Cold: 0.118053ms GPU, 0.122040ms CPU, 0.50s total GPU, 4236x +``` + +# Benchmark Results + +## sequence_bench + +### [0] Quadro GV100 + +| Items | Size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | +|----------|------------|---------|------------|-------|------------|-------|----------|--------------|--------| +| 16777216 | 64.000 MiB | 88096x | 112.108 us | 0.44% | 107.383 us | 0.52% | 156.238G | 624.951 GB/s | 71.80% | + +### [1] Quadro GP100 + +| Items | Size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | +|----------|------------|---------|------------|-------|------------|-------|----------|--------------|--------| +| 16777216 | 64.000 MiB | 4236x | 122.040 us | 0.32% | 118.053 us | 0.31% | 142.116G | 568.464 GB/s | 77.64% | diff --git a/examples/outputs/nvbench.example.exec_tag_timer.csv b/examples/outputs/nvbench.example.exec_tag_timer.csv new file mode 100644 index 0000000..9680594 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_timer.csv @@ -0,0 +1,3 @@ +Benchmark,Device,Device Name,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak +mod2_inplace,0,Quadro 
GV100,No,27572,0.00026979653764688707,0.009574712799615451,0.0002636220670364504,0.009810298459991085,63641167025.97682,509129336207.8146,0.5849371969299341 +mod2_inplace,1,Quadro GP100,No,26721,0.0002731037747090319,0.005791152713027165,0.000268388258513477,0.005821839019989222,62510990953.64315,500087927629.1452,0.6830309326228491 diff --git a/examples/outputs/nvbench.example.exec_tag_timer.json b/examples/outputs/nvbench.example.exec_tag_timer.json new file mode 100644 index 0000000..d5650c9 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_timer.json @@ -0,0 +1,378 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "mod2_inplace", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": null, 
+ "states": { + "Device=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "27572" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00026979653764688707" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009574712799615451" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002636220670364504" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.009810298459991085" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "63641167025.97682" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "509129336207.8146" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5849371969299341" + } + } + }, + "is_skipped": false + }, + "Device=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "26721" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0002731037747090319" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005791152713027165" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.000268388258513477" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.005821839019989222" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "62510990953.64315" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." 
+ }, + "value": { + "type": "float64", + "value": "500087927629.1452" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.6830309326228491" + } + } + }, + "is_skipped": false + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.exec_tag_timer.list.md b/examples/outputs/nvbench.example.exec_tag_timer.list.md new file mode 100644 index 0000000..09c2b13 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_timer.list.md @@ -0,0 +1,32 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 30117 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14891 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `mod2_inplace` (1 configurations) + diff --git a/examples/outputs/nvbench.example.exec_tag_timer.md b/examples/outputs/nvbench.example.exec_tag_timer.md new file mode 100644 index 0000000..8f842a2 --- /dev/null +++ b/examples/outputs/nvbench.example.exec_tag_timer.md @@ -0,0 +1,54 @@ +# Devices + 
+## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: mod2_inplace [Device=0] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.98% > 0.50%) +Pass: Cold: 0.263622ms GPU, 0.269797ms CPU, 7.27s total GPU, 27572x +Run: mod2_inplace [Device=1] +Warn: Current measurement timed out (15.00s) while over noise threshold (0.58% > 0.50%) +Pass: Cold: 0.268388ms GPU, 0.273104ms CPU, 7.17s total GPU, 26721x +``` + +# Benchmark Results + +## mod2_inplace + +### [0] Quadro GV100 + +| Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | +|---------|------------|-------|------------|-------|---------|--------------|--------| +| 27572x | 269.797 us | 0.96% | 263.622 us | 0.98% | 63.641G | 509.129 GB/s | 58.49% | + +### [1] Quadro GP100 + +| Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | +|---------|------------|-------|------------|-------|---------|--------------|--------| +| 26721x | 273.104 us | 0.58% | 268.388 us | 0.58% | 62.511G | 500.088 GB/s | 68.30% | diff --git a/examples/outputs/nvbench.example.skip.csv 
b/examples/outputs/nvbench.example.skip.csv new file mode 100644 index 0000000..4d85c73 --- /dev/null +++ b/examples/outputs/nvbench.example.skip.csv @@ -0,0 +1,71 @@ +Benchmark,Device,Device Name,Duration,Kramble,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,In,Out +runtime_skip,0,Quadro GV100,0,Foo,No,148083,1.0558313054165725e-05,0.03460045726144196,4.411147755695762e-06,0.10691914249190457,2.086331167986729e-06,239683,, +runtime_skip,0,Quadro GV100,0.00025,Foo,No,1967,0.00026056515149974605,0.001785414177412443,0.0002542243968822252,0.0022747905334427393,0.0002519049829290819,2064,, +runtime_skip,0,Quadro GV100,0.0005,Foo,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.00075,Foo,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.001,Foo,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0,Bar,No,147976,1.0477963967129967e-05,0.05193537216616394,4.309996180719637e-06,0.09631511021814754,2.1032374904355167e-06,237900,, +runtime_skip,0,Quadro GV100,0.00025,Bar,No,1967,0.0002606047381799697,0.004402412277520235,0.00025425122354802296,0.0019316173072071725,0.00025190548564112467,2064,, +runtime_skip,0,Quadro GV100,0.0005,Bar,No,993,0.0005103389546827793,0.0009044135054133329,0.0005038951597545129,0.0009132341980037791,0.0005017609577982818,1044,, +runtime_skip,0,Quadro GV100,0.00075,Bar,No,664,0.0007602315828313258,0.0012393600029445503,0.0007537762098104041,0.0006523899466894487,0.0007516189640871593,697,, +runtime_skip,0,Quadro GV100,0.001,Bar,No,499,0.0010100405511022032,0.0004675770563736675,0.0010036521122785227,0.000506242755428486,0.0010014759304418617,523,, +runtime_skip,0,Quadro GV100,0,Baz,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.00025,Baz,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.0005,Baz,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.00075,Baz,Yes,,,,,,,,, +runtime_skip,0,Quadro GV100,0.001,Baz,No,499,0.0010100214969939881,0.0004890967396052713,0.0010036210096431835,0.0005031795348101782,0.001001475909284053,524,, +runtime_skip,1,Quadro 
GP100,0,Foo,No,152833,7.790139459409939e-06,0.052486926520143126,3.0540317811368876e-06,0.041965307035547426,1.3476766561262745e-06,371096,, +runtime_skip,1,Quadro GP100,0.00025,Foo,No,1977,0.0002577336697015675,0.0016745726043248602,0.0002530303939943309,0.0012463205002970757,0.0002519046914100187,2073,, +runtime_skip,1,Quadro GP100,0.0005,Foo,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.00075,Foo,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.001,Foo,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0,Bar,No,152569,7.89847543734305e-06,0.060900564134115376,3.1315013016758387e-06,0.06800325272918997,1.4410427557047706e-06,346971,, +runtime_skip,1,Quadro GP100,0.00025,Bar,No,1977,0.00025772118614061663,0.0027168225148294085,0.0002530340679558949,0.0012784718063768952,0.0002519045087617526,2074,, +runtime_skip,1,Quadro GP100,0.0005,Bar,No,995,0.0005076002613065325,0.0008452471055542708,0.0005028815761283412,0.0006166721165867504,0.0005017611416903409,1045,, +runtime_skip,1,Quadro GP100,0.00075,Bar,No,665,0.0007574173563909778,0.0005404728977728138,0.0007527303210774766,0.00042097281680210553,0.0007516172567547905,698,, +runtime_skip,1,Quadro GP100,0.001,Bar,No,499,0.0010073301743486979,0.0004181556191860385,0.0010026005258063262,0.0003018745447224176,0.0010014736961772425,524,, +runtime_skip,1,Quadro GP100,0,Baz,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.00025,Baz,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.0005,Baz,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.00075,Baz,Yes,,,,,,,,, +runtime_skip,1,Quadro GP100,0.001,Baz,No,499,0.0010072666492985964,0.0004034748822624355,0.0010025921845006082,0.0002988324879359347,0.001001474511532383,524,, +skip_overload,0,Quadro GV100,,,Yes,,,,,,,,I32,I32 +skip_overload,0,Quadro GV100,,,No,499,0.0010101158116232471,0.00045457872179042216,0.00100363749086498,0.0004920954065745232,0.001001477914376195,523,I32,I64 +skip_overload,0,Quadro 
GV100,,,No,499,0.0010100936012024046,0.0004656012199546097,0.0010036259481089817,0.00048536100052596265,0.0010014759304418617,523,I64,I32 +skip_overload,0,Quadro GV100,,,Yes,,,,,,,,I64,I64 +skip_overload,1,Quadro GP100,,,Yes,,,,,,,,I32,I32 +skip_overload,1,Quadro GP100,,,No,499,0.0010072941002004009,0.00039832922996069914,0.0010025732686858855,0.0003053582386332306,0.0010014733467393249,524,I32,I64 +skip_overload,1,Quadro GP100,,,No,499,0.0010072575310621243,0.0004222831525597227,0.0010025880217313281,0.0003175275654782765,0.001001473929135854,524,I64,I32 +skip_overload,1,Quadro GP100,,,Yes,,,,,,,,I64,I64 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101381663326664,0.0004740828150298835,0.0010036533932408684,0.0005311247776469751,0.001001475909284053,524,I8,I8 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101366533066124,0.0004762668191055286,0.0010036824432785813,0.0005225074502850834,0.001001475909284053,524,I8,I16 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101454048096201,0.00045304078278456247,0.0010037151510586427,0.0005255710579717081,0.0010014777976741756,523,I8,I32 +skip_sfinae,0,Quadro GV100,,,No,499,0.001010251853707415,0.0004599262349912718,0.001003687190388386,0.0005303166463057176,0.001001477889432252,524,I8,I64 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I16,I8 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010102188396793595,0.0004716222422331537,0.0010036723759465746,0.0004916019059506635,0.0010014759304418617,523,I16,I16 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101248837675358,0.0004725693580783232,0.0010036886648567929,0.0005032992730078731,0.0010014777729529462,524,I16,I32 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101596753507014,0.0004462484178997469,0.0010037086084037075,0.0005439265200637692,0.0010014759304418617,523,I16,I64 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I32,I8 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I32,I16 +skip_sfinae,0,Quadro 
GV100,,,No,499,0.001010157290581163,0.00048501754313990346,0.001003686870267248,0.0005466560867161739,0.0010014759304418617,523,I32,I32 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101838817635274,0.00045302096806037583,0.0010036930261489573,0.0004856757915143649,0.001001475909284053,524,I32,I64 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I8 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I16 +skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I32 +skip_sfinae,0,Quadro GV100,,,No,499,0.0010101592084168342,0.00044733655275106617,0.0010036638461516183,0.00048495653843800993,0.0010014777976741756,523,I64,I64 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072615390781564,0.00041755776345504055,0.0010025953974179124,0.00031401673972615964,0.0010014743950530773,524,I8,I8 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072623366733464,0.0004115227504783334,0.0010025991167955256,0.0003150799184063617,0.0010014747444909947,524,I8,I16 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010073937254509017,0.00047436520440331174,0.0010025578190186219,0.0003174654268713629,0.0010014748609703007,524,I8,I32 +skip_sfinae,1,Quadro GP100,,,No,499,0.001007314288577154,0.00044943976467369485,0.0010025514015453848,0.00031771350129621625,0.0010014748609703007,524,I8,I64 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I16,I8 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072553206412834,0.00041747568393080776,0.0010025701303042489,0.0003128921530447738,0.0010014740456151597,524,I16,I16 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072746412825657,0.00041073548319461465,0.0010026024477276384,0.00031052785600836483,0.0010014741620944655,524,I16,I32 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072725270541083,0.0004158366079958995,0.0010025823137803163,0.0003051386556782456,0.0010014741620944655,524,I16,I64 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I32,I8 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I32,I16 +skip_sfinae,1,Quadro 
GP100,,,No,499,0.0010072895851703403,0.00041834159303612994,0.0010025929590026458,0.0002996916722121692,0.0010014740456151597,524,I32,I32 +skip_sfinae,1,Quadro GP100,,,No,499,0.0010072442464929862,0.0004215890640515976,0.0010025749361825603,0.0003141386734389898,0.001001474628011689,524,I32,I64 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I8 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I16 +skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I32 +skip_sfinae,1,Quadro GP100,,,No,499,0.001007259180360722,0.000416945457190629,0.0010025870577844691,0.0003166772418677883,0.001001475210408218,524,I64,I64 diff --git a/examples/outputs/nvbench.example.skip.json b/examples/outputs/nvbench.example.skip.json new file mode 100644 index 0000000..f17f617 --- /dev/null +++ b/examples/outputs/nvbench.example.skip.json @@ -0,0 +1,6815 @@ +{ + "devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + 
"ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "runtime_skip", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "Duration": { + "type": "float64", + "flags": "", + "values": [ + { + "input_string": "0", + "description": "", + "value": 0.0 + }, + { + "input_string": "0.00025", + "description": "", + "value": 0.00025 + }, + { + "input_string": "0.0005", + "description": "", + "value": 0.0005 + }, + { + "input_string": "0.00075", + "description": "", + "value": 0.00075 + }, + { + "input_string": "0.001", + "description": "", + "value": 0.001 + } + ] + }, + "Kramble": { + "type": "string", + "flags": "", + "values": [ + { + "input_string": "Foo", + "description": "", + "value": "Foo" + }, + { + "input_string": "Bar", + "description": "", + "value": "Bar" + }, + { + "input_string": "Baz", + "description": "", + "value": "Baz" + } + ] + } + }, + "states": { + "Device=0 Duration=0 Kramble=Foo": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "148083" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "1.0558313054165725e-05" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.03460045726144196" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "4.411147755695762e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.10691914249190457" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "2.086331167986729e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "239683" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.00025 Kramble=Foo": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1967" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00026056515149974605" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.001785414177412443" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0002542243968822252" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0022747905334427393" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002519049829290819" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2064" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0005 Kramble=Foo": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." 
+ }, + "Device=0 Duration=0.00075 Kramble=Foo": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." + }, + "Device=0 Duration=0.001 Kramble=Foo": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." + }, + "Device=0 Duration=0 Kramble=Bar": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "147976" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "1.0477963967129967e-05" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.05193537216616394" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "4.309996180719637e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.09631511021814754" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "2.1032374904355167e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "237900" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.00025 Kramble=Bar": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1967" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0002606047381799697" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.004402412277520235" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.00025425122354802296" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0019316173072071725" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00025190548564112467" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2064" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.0005 Kramble=Bar": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "993" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0005103389546827793" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0009044135054133329" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0005038951597545129" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0009132341980037791" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0005017609577982818" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1044" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.00075 Kramble=Bar": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "664" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007602315828313258" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0012393600029445503" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007537762098104041" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006523899466894487" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007516189640871593" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "697" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0.001 Kramble=Bar": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010100405511022032" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004675770563736675" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036521122785227" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.000506242755428486" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 Duration=0 Kramble=Baz": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." 
+ }, + "Device=0 Duration=0.00025 Kramble=Baz": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=0 Duration=0.0005 Kramble=Baz": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=0 Duration=0.00075 Kramble=Baz": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=0 Duration=0.001 Kramble=Baz": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010100214969939881" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004890967396052713" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036210096431835" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005031795348101782" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0 Kramble=Foo": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "152833" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "7.790139459409939e-06" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.052486926520143126" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "3.0540317811368876e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.041965307035547426" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "1.3476766561262745e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "371096" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.00025 Kramble=Foo": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1977" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0002577336697015675" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0016745726043248602" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0002530303939943309" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0012463205002970757" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002519046914100187" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2073" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0005 Kramble=Foo": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." 
+ }, + "Device=1 Duration=0.00075 Kramble=Foo": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." + }, + "Device=1 Duration=0.001 Kramble=Foo": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Foo" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Long 'Foo' benchmarks are skipped." + }, + "Device=1 Duration=0 Kramble=Bar": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "152569" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "7.89847543734305e-06" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.060900564134115376" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "3.1315013016758387e-06" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.06800325272918997" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "1.4410427557047706e-06" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "346971" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.00025 Kramble=Bar": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "1977" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00025772118614061663" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0027168225148294085" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0002530340679558949" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0012784718063768952" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002519045087617526" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "2074" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.0005 Kramble=Bar": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "995" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0005076002613065325" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0008452471055542708" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0005028815761283412" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0006166721165867504" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0005017611416903409" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "1045" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.00075 Kramble=Bar": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "665" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0007574173563909778" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0005404728977728138" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007527303210774766" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00042097281680210553" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0007516172567547905" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "698" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0.001 Kramble=Bar": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Bar" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073301743486979" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004181556191860385" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010026005258063262" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003018745447224176" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014736961772425" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 Duration=0 Kramble=Baz": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." 
+ }, + "Device=1 Duration=0.00025 Kramble=Baz": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00025" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=1 Duration=0.0005 Kramble=Baz": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.0005" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=1 Duration=0.00075 Kramble=Baz": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.00075" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "Short 'Baz' benchmarks are skipped." + }, + "Device=1 Duration=0.001 Kramble=Baz": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "Duration": { + "type": "float64", + "value": "0.001" + }, + "Kramble": { + "type": "string", + "value": "Baz" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072666492985964" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004034748822624355" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025921845006082" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0002988324879359347" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.001001474511532383" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + }, + { + "index": 1, + "name": "skip_overload", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "In": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + } + ] + }, + "Out": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 In=I32 Out=I32": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "InputType == OutputType." 
+ }, + "Device=0 In=I32 Out=I64": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101158116232471" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00045457872179042216" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.00100363749086498" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004920954065745232" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001477914376195" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I64 Out=I32": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010100936012024046" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004656012199546097" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036259481089817" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00048536100052596265" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I64 Out=I64": { + "device": 0, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "InputType == OutputType." + }, + "Device=1 In=I32 Out=I32": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "InputType == OutputType." + }, + "Device=1 In=I32 Out=I64": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072941002004009" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00039832922996069914" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025732686858855" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003053582386332306" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014733467393249" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I64 Out=I32": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072575310621243" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0004222831525597227" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025880217313281" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003175275654782765" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001473929135854" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I64 Out=I64": { + "device": 1, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "InputType == OutputType." 
+ } + } + }, + { + "index": 2, + "name": "skip_sfinae", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": { + "In": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I8", + "description": "int8_t", + "is_active": true + }, + { + "input_string": "I16", + "description": "int16_t", + "is_active": true + }, + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + } + ] + }, + "Out": { + "type": "type", + "flags": "", + "values": [ + { + "input_string": "I8", + "description": "int8_t", + "is_active": true + }, + { + "input_string": "I16", + "description": "int16_t", + "is_active": true + }, + { + "input_string": "I32", + "description": "int32_t", + "is_active": true + }, + { + "input_string": "I64", + "description": "int64_t", + "is_active": true + } + ] + } + }, + "states": { + "Device=0 In=I8 Out=I8": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." 
+ }, + "value": { + "type": "float64", + "value": "0.0010101381663326664" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004740828150298835" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036533932408684" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005311247776469751" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=I16": { + "device": 0, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101366533066124" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004762668191055286" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036824432785813" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005225074502850834" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=I32": { + "device": 0, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101454048096201" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00045304078278456247" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010037151510586427" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005255710579717081" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014777976741756" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I8 Out=I64": { + "device": 0, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001010251853707415" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0004599262349912718" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003687190388386" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005303166463057176" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001477889432252" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=I8": { + "device": 0, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=I16 Out=I16": { + "device": 0, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010102188396793595" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004716222422331537" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036723759465746" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004916019059506635" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=I32": { + "device": 0, + "type_config_index": 6, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101248837675358" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004725693580783232" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036886648567929" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005032992730078731" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014777729529462" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I16 Out=I64": { + "device": 0, + "type_config_index": 7, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101596753507014" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0004462484178997469" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010037086084037075" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005439265200637692" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I32 Out=I8": { + "device": 0, + "type_config_index": 8, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=I32 Out=I16": { + "device": 0, + "type_config_index": 9, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I32 Out=I32": { + "device": 0, + "type_config_index": 10, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001010157290581163" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00048501754313990346" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001003686870267248" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0005466560867161739" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014759304418617" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I32 Out=I64": { + "device": 0, + "type_config_index": 11, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101838817635274" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00045302096806037583" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010036930261489573" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004856757915143649" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475909284053" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=0 In=I64 Out=I8": { + "device": 0, + "type_config_index": 12, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I64 Out=I16": { + "device": 0, + "type_config_index": 13, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=0 In=I64 Out=I32": { + "device": 0, + "type_config_index": 14, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=0 In=I64 Out=I64": { + "device": 0, + "type_config_index": 15, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010101592084168342" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00044733655275106617" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010036638461516183" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00048495653843800993" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014777976741756" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "523" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I8": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072615390781564" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00041755776345504055" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010025953974179124" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00031401673972615964" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014743950530773" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I16": { + "device": 1, + "type_config_index": 1, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072623366733464" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004115227504783334" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025991167955256" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003150799184063617" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014747444909947" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I32": { + "device": 1, + "type_config_index": 2, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010073937254509017" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00047436520440331174" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025578190186219" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003174654268713629" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014748609703007" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I8 Out=I64": { + "device": 1, + "type_config_index": 3, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I8" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001007314288577154" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00044943976467369485" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010025514015453848" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00031771350129621625" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014748609703007" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=I8": { + "device": 1, + "type_config_index": 4, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I16 Out=I16": { + "device": 1, + "type_config_index": 5, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072553206412834" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00041747568393080776" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010025701303042489" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003128921530447738" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014740456151597" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=I32": { + "device": 1, + "type_config_index": 6, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." 
+ }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072746412825657" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00041073548319461465" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010026024477276384" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.00031052785600836483" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010014741620944655" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I16 Out=I64": { + "device": 1, + "type_config_index": 7, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I16" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072725270541083" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.0004158366079958995" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025823137803163" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003051386556782456" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014741620944655" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I32 Out=I8": { + "device": 1, + "type_config_index": 8, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I32 Out=I16": { + "device": 1, + "type_config_index": 9, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I32 Out=I32": { + "device": 1, + "type_config_index": 10, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072895851703403" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.00041834159303612994" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025929590026458" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0002996916722121692" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010014740456151597" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I32 Out=I64": { + "device": 1, + "type_config_index": 11, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I32" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.0010072442464929862" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0004215890640515976" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." 
+ }, + "value": { + "type": "float64", + "value": "0.0010025749361825603" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003141386734389898" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001474628011689" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + }, + "Device=1 In=I64 Out=I8": { + "device": 1, + "type_config_index": 12, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I8" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I64 Out=I16": { + "device": 1, + "type_config_index": 13, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I16" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." 
+ }, + "Device=1 In=I64 Out=I32": { + "device": 1, + "type_config_index": 14, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I32" + } + }, + "summaries": null, + "is_skipped": true, + "skip_reason": "sizeof(InputType) > sizeof(OutputType)." + }, + "Device=1 In=I64 Out=I64": { + "device": 1, + "type_config_index": 15, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": { + "In": { + "type": "string", + "value": "I64" + }, + "Out": { + "type": "string", + "value": "I64" + } + }, + "summaries": { + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "499" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.001007259180360722" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.000416945457190629" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0010025870577844691" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.0003166772418677883" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.001001475210408218" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "524" + } + } + }, + "is_skipped": false + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.skip.list.md b/examples/outputs/nvbench.example.skip.list.md new file mode 100644 index 0000000..a096004 --- /dev/null +++ b/examples/outputs/nvbench.example.skip.list.md @@ -0,0 +1,71 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 31309 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15467 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] `runtime_skip` (15 configurations) + +### Axes + +* `Duration` : float64 + * `0` + * `0.00025` + * `0.0005` + * `0.00075` + * `0.001` +* `Kramble` : string + * `Foo` + * `Bar` + * `Baz` + +## [1] `skip_overload` (4 configurations) + +### Axes + +* `In` : type + * `I32` (int32_t) + * `I64` (int64_t) +* `Out` : type + * `I32` (int32_t) + * `I64` (int64_t) + +## [2] `skip_sfinae` (16 configurations) + +### Axes + +* `In` : type + * `I8` (int8_t) + * `I16` (int16_t) + * `I32` (int32_t) + * `I64` (int64_t) +* `Out` : type + * `I8` (int8_t) + * `I16` (int16_t) + * `I32` (int32_t) + * `I64` (int64_t) + diff --git a/examples/outputs/nvbench.example.skip.md b/examples/outputs/nvbench.example.skip.md new file mode 100644 
index 0000000..62e5ca7 --- /dev/null +++ b/examples/outputs/nvbench.example.skip.md @@ -0,0 +1,296 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: runtime_skip [Device=0 Duration=0 Kramble=Foo] +Warn: Current measurement timed out (15.00s) while over noise threshold (10.69% > 0.50%) +Pass: Cold: 0.004411ms GPU, 0.010558ms CPU, 0.65s total GPU, 148083x +Pass: Batch: 0.002086ms GPU, 0.50s total GPU, 239683x +Run: runtime_skip [Device=0 Duration=0.00025 Kramble=Foo] +Pass: Cold: 0.254224ms GPU, 0.260565ms CPU, 0.50s total GPU, 1967x +Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2064x +Run: runtime_skip [Device=0 Duration=0.0005 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. +Run: runtime_skip [Device=0 Duration=0.00075 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. +Run: runtime_skip [Device=0 Duration=0.001 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. 
+Run: runtime_skip [Device=0 Duration=0 Kramble=Bar] +Warn: Current measurement timed out (15.00s) while over noise threshold (9.63% > 0.50%) +Pass: Cold: 0.004310ms GPU, 0.010478ms CPU, 0.64s total GPU, 147976x +Pass: Batch: 0.002103ms GPU, 0.50s total GPU, 237900x +Run: runtime_skip [Device=0 Duration=0.00025 Kramble=Bar] +Pass: Cold: 0.254251ms GPU, 0.260605ms CPU, 0.50s total GPU, 1967x +Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2064x +Run: runtime_skip [Device=0 Duration=0.0005 Kramble=Bar] +Pass: Cold: 0.503895ms GPU, 0.510339ms CPU, 0.50s total GPU, 993x +Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1044x +Run: runtime_skip [Device=0 Duration=0.00075 Kramble=Bar] +Pass: Cold: 0.753776ms GPU, 0.760232ms CPU, 0.50s total GPU, 664x +Pass: Batch: 0.751619ms GPU, 0.52s total GPU, 697x +Run: runtime_skip [Device=0 Duration=0.001 Kramble=Bar] +Pass: Cold: 1.003652ms GPU, 1.010041ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: runtime_skip [Device=0 Duration=0 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=0 Duration=0.00025 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=0 Duration=0.0005 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=0 Duration=0.00075 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. 
+Run: runtime_skip [Device=0 Duration=0.001 Kramble=Baz] +Pass: Cold: 1.003621ms GPU, 1.010021ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: runtime_skip [Device=1 Duration=0 Kramble=Foo] +Warn: Current measurement timed out (15.00s) while over noise threshold (4.20% > 0.50%) +Warn: Current measurement timed out (15.00s) before accumulating min_time (0.47s < 0.50s) +Pass: Cold: 0.003054ms GPU, 0.007790ms CPU, 0.47s total GPU, 152833x +Pass: Batch: 0.001348ms GPU, 0.50s total GPU, 371096x +Run: runtime_skip [Device=1 Duration=0.00025 Kramble=Foo] +Pass: Cold: 0.253030ms GPU, 0.257734ms CPU, 0.50s total GPU, 1977x +Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2073x +Run: runtime_skip [Device=1 Duration=0.0005 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.00075 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.001 Kramble=Foo] +Skip: Long 'Foo' benchmarks are skipped. 
+Run: runtime_skip [Device=1 Duration=0 Kramble=Bar] +Warn: Current measurement timed out (15.00s) while over noise threshold (6.80% > 0.50%) +Warn: Current measurement timed out (15.00s) before accumulating min_time (0.48s < 0.50s) +Pass: Cold: 0.003132ms GPU, 0.007898ms CPU, 0.48s total GPU, 152569x +Pass: Batch: 0.001441ms GPU, 0.50s total GPU, 346971x +Run: runtime_skip [Device=1 Duration=0.00025 Kramble=Bar] +Pass: Cold: 0.253034ms GPU, 0.257721ms CPU, 0.50s total GPU, 1977x +Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2074x +Run: runtime_skip [Device=1 Duration=0.0005 Kramble=Bar] +Pass: Cold: 0.502882ms GPU, 0.507600ms CPU, 0.50s total GPU, 995x +Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1045x +Run: runtime_skip [Device=1 Duration=0.00075 Kramble=Bar] +Pass: Cold: 0.752730ms GPU, 0.757417ms CPU, 0.50s total GPU, 665x +Pass: Batch: 0.751617ms GPU, 0.52s total GPU, 698x +Run: runtime_skip [Device=1 Duration=0.001 Kramble=Bar] +Pass: Cold: 1.002601ms GPU, 1.007330ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: runtime_skip [Device=1 Duration=0 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.00025 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.0005 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.00075 Kramble=Baz] +Skip: Short 'Baz' benchmarks are skipped. +Run: runtime_skip [Device=1 Duration=0.001 Kramble=Baz] +Pass: Cold: 1.002592ms GPU, 1.007267ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: skip_overload [Device=0 In=I32 Out=I32] +Skip: InputType == OutputType. 
+Run: skip_overload [Device=0 In=I32 Out=I64] +Pass: Cold: 1.003637ms GPU, 1.010116ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x +Run: skip_overload [Device=0 In=I64 Out=I32] +Pass: Cold: 1.003626ms GPU, 1.010094ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: skip_overload [Device=0 In=I64 Out=I64] +Skip: InputType == OutputType. +Run: skip_overload [Device=1 In=I32 Out=I32] +Skip: InputType == OutputType. +Run: skip_overload [Device=1 In=I32 Out=I64] +Pass: Cold: 1.002573ms GPU, 1.007294ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x +Run: skip_overload [Device=1 In=I64 Out=I32] +Pass: Cold: 1.002588ms GPU, 1.007258ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_overload [Device=1 In=I64 Out=I64] +Skip: InputType == OutputType. +Run: skip_sfinae [Device=0 In=I8 Out=I8] +Pass: Cold: 1.003653ms GPU, 1.010138ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=0 In=I8 Out=I16] +Pass: Cold: 1.003682ms GPU, 1.010137ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=0 In=I8 Out=I32] +Pass: Cold: 1.003715ms GPU, 1.010145ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x +Run: skip_sfinae [Device=0 In=I8 Out=I64] +Pass: Cold: 1.003687ms GPU, 1.010252ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=0 In=I16 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). 
+Run: skip_sfinae [Device=0 In=I16 Out=I16] +Pass: Cold: 1.003672ms GPU, 1.010219ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: skip_sfinae [Device=0 In=I16 Out=I32] +Pass: Cold: 1.003689ms GPU, 1.010125ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=0 In=I16 Out=I64] +Pass: Cold: 1.003709ms GPU, 1.010160ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: skip_sfinae [Device=0 In=I32 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=0 In=I32 Out=I16] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=0 In=I32 Out=I32] +Pass: Cold: 1.003687ms GPU, 1.010157ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x +Run: skip_sfinae [Device=0 In=I32 Out=I64] +Pass: Cold: 1.003693ms GPU, 1.010184ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=0 In=I64 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=0 In=I64 Out=I16] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=0 In=I64 Out=I32] +Skip: sizeof(InputType) > sizeof(OutputType). 
+Run: skip_sfinae [Device=0 In=I64 Out=I64] +Pass: Cold: 1.003664ms GPU, 1.010159ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x +Run: skip_sfinae [Device=1 In=I8 Out=I8] +Pass: Cold: 1.002595ms GPU, 1.007262ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I8 Out=I16] +Pass: Cold: 1.002599ms GPU, 1.007262ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I8 Out=I32] +Pass: Cold: 1.002558ms GPU, 1.007394ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I8 Out=I64] +Pass: Cold: 1.002551ms GPU, 1.007314ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I16 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=1 In=I16 Out=I16] +Pass: Cold: 1.002570ms GPU, 1.007255ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I16 Out=I32] +Pass: Cold: 1.002602ms GPU, 1.007275ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I16 Out=I64] +Pass: Cold: 1.002582ms GPU, 1.007273ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I32 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=1 In=I32 Out=I16] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=1 In=I32 Out=I32] +Pass: Cold: 1.002593ms GPU, 1.007290ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I32 Out=I64] +Pass: Cold: 1.002575ms GPU, 1.007244ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +Run: skip_sfinae [Device=1 In=I64 Out=I8] +Skip: sizeof(InputType) > sizeof(OutputType). 
+Run: skip_sfinae [Device=1 In=I64 Out=I16] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=1 In=I64 Out=I32] +Skip: sizeof(InputType) > sizeof(OutputType). +Run: skip_sfinae [Device=1 In=I64 Out=I64] +Pass: Cold: 1.002587ms GPU, 1.007259ms CPU, 0.50s total GPU, 499x +Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x +``` + +# Benchmark Results + +## runtime_skip + +### [0] Quadro GV100 + +| Duration | Kramble | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|---------|------------|-------|------------|--------|------------|---------| +| 0 | Foo | 148083x | 10.558 us | 3.46% | 4.411 us | 10.69% | 2.086 us | 239683x | +| 0.00025 | Foo | 1967x | 260.565 us | 0.18% | 254.224 us | 0.23% | 251.905 us | 2064x | +| 0 | Bar | 147976x | 10.478 us | 5.19% | 4.310 us | 9.63% | 2.103 us | 237900x | +| 0.00025 | Bar | 1967x | 260.605 us | 0.44% | 254.251 us | 0.19% | 251.905 us | 2064x | +| 0.0005 | Bar | 993x | 510.339 us | 0.09% | 503.895 us | 0.09% | 501.761 us | 1044x | +| 0.00075 | Bar | 664x | 760.232 us | 0.12% | 753.776 us | 0.07% | 751.619 us | 697x | +| 0.001 | Bar | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| 0.001 | Baz | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | + +### [1] Quadro GP100 + +| Duration | Kramble | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|----------|---------|---------|------------|-------|------------|-------|------------|---------| +| 0 | Foo | 152833x | 7.790 us | 5.25% | 3.054 us | 4.20% | 1.348 us | 371096x | +| 0.00025 | Foo | 1977x | 257.734 us | 0.17% | 253.030 us | 0.12% | 251.905 us | 2073x | +| 0 | Bar | 152569x | 7.898 us | 6.09% | 3.132 us | 6.80% | 1.441 us | 346971x | +| 0.00025 | Bar | 1977x | 257.721 us | 0.27% | 253.034 us | 0.13% | 251.905 us | 2074x | +| 0.0005 | Bar | 995x | 507.600 us | 0.08% | 502.882 us | 0.06% | 501.761 us | 1045x | +| 0.00075 | Bar | 665x | 757.417 us | 0.05% | 752.730 us 
| 0.04% | 751.617 us | 698x | +| 0.001 | Bar | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| 0.001 | Baz | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## skip_overload + +### [0] Quadro GV100 + +| In | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|-----|-----|---------|----------|-------|----------|-------|-----------|-------| +| I32 | I64 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| I64 | I32 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | + +### [1] Quadro GP100 + +| In | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|-----|-----|---------|----------|-------|----------|-------|-----------|-------| +| I32 | I64 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I64 | I32 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | + +## skip_sfinae + +### [0] Quadro GV100 + +| In | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | +|-----|-----|---------|----------|-------|----------|-------|-----------|-------| +| I8 | I8 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| I8 | I16 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| I8 | I32 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| I8 | I64 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| I16 | I16 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| I16 | I32 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| I16 | I64 | 499x | 1.010 ms | 0.04% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| I32 | I32 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 523x | +| I32 | I64 | 499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% | 1.001 ms | 524x | +| I64 | I64 | 499x | 1.010 ms | 0.04% | 1.004 ms | 0.05% | 1.001 ms | 523x | + +### [1] Quadro GP100 + +| In | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch | 
+|-----|-----|---------|----------|-------|----------|-------|-----------|-------| +| I8 | I8 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I8 | I16 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I8 | I32 | 499x | 1.007 ms | 0.05% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I8 | I64 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I16 | I16 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I16 | I32 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I16 | I64 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I32 | I32 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I32 | I64 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | +| I64 | I64 | 499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% | 1.001 ms | 524x | diff --git a/examples/outputs/nvbench.example.throughput.csv b/examples/outputs/nvbench.example.throughput.csv new file mode 100644 index 0000000..30eca18 --- /dev/null +++ b/examples/outputs/nvbench.example.throughput.csv @@ -0,0 +1,3 @@ +Benchmark,Device,Device Name,Skipped,NumElements,DataSize (bytes),Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak,Batch GPU (sec),Batch +throughput_bench,0,Quadro GV100,No,16777216,67108864,47755,0.00027093838689142973,0.011175841617840646,0.00026478739019249176,0.011463549240955353,63361083727.603165,506888669820.8253,0.5823629019081173,0.0002632571401085129,47756 +throughput_bench,1,Quadro GP100,No,16777216,67108864,46734,0.00028047131987418375,0.009878517915727511,0.0002757727249773843,0.009988896437015563,60837111434.337364,486696891474.6989,0.664741165147917,0.0002754124925807889,46735 diff --git a/examples/outputs/nvbench.example.throughput.json b/examples/outputs/nvbench.example.throughput.json new file mode 100644 index 0000000..e13072e --- /dev/null +++ b/examples/outputs/nvbench.example.throughput.json @@ -0,0 +1,498 @@ +{ + 
"devices": [ + { + "id": 0, + "name": "Quadro GV100", + "sm_version": 700, + "ptx_version": 700, + "sm_default_clock_rate": 1627000000, + "number_of_sms": 80, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 34086060032, + "global_memory_bus_peak_clock_rate": 850000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 870400000000, + "l2_cache_size": 6291456, + "shared_memory_per_sm": 98304, + "shared_memory_per_block": 49152, + "ecc_state": false + }, + { + "id": 1, + "name": "Quadro GP100", + "sm_version": 600, + "ptx_version": 600, + "sm_default_clock_rate": 1442500000, + "number_of_sms": 56, + "max_blocks_per_sm": 32, + "max_threads_per_sm": 2048, + "max_threads_per_block": 1024, + "registers_per_sm": 65536, + "registers_per_block": 65536, + "global_memory_size": 17069309952, + "global_memory_bus_peak_clock_rate": 715000000, + "global_memory_bus_width": 4096, + "global_memory_bus_bandwidth": 732160000000, + "l2_cache_size": 4194304, + "shared_memory_per_sm": 65536, + "shared_memory_per_block": 49152, + "ecc_state": false + } + ], + "benchmarks": [ + { + "index": 0, + "name": "throughput_bench", + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "devices": [ + 0, + 1 + ], + "axes": null, + "states": { + "Device=0": { + "device": 0, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Element count: NumElements": { + "short_name": { + "type": "string", + "value": "NumElements" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "DataSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + 
"Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "47755" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00027093838689142973" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.011175841617840646" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.00026478739019249176" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.011463549240955353" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "63361083727.603165" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "506888669820.8253" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." + }, + "value": { + "type": "float64", + "value": "0.5823629019081173" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002632571401085129" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." 
+ }, + "value": { + "type": "int64", + "value": "47756" + } + } + }, + "is_skipped": false + }, + "Device=1": { + "device": 1, + "type_config_index": 0, + "min_samples": 10, + "min_time": 0.5, + "max_noise": 0.005, + "skip_time": -1.0, + "timeout": 15.0, + "axis_values": null, + "summaries": { + "Element count: NumElements": { + "short_name": { + "type": "string", + "value": "NumElements" + }, + "value": { + "type": "int64", + "value": "16777216" + } + }, + "Input Buffer Size: ": { + "hint": { + "type": "string", + "value": "bytes" + }, + "short_name": { + "type": "string", + "value": "DataSize" + }, + "value": { + "type": "int64", + "value": "67108864" + } + }, + "Number of Samples (Cold)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Samples" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in cold time measurements." + }, + "value": { + "type": "int64", + "value": "46734" + } + }, + "Average CPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "CPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time observed from host." + }, + "value": { + "type": "float64", + "value": "0.00028047131987418375" + } + }, + "CPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold CPU execution time measurements." 
+ }, + "value": { + "type": "float64", + "value": "0.009878517915727511" + } + }, + "Average GPU Time (Cold)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "GPU Time" + }, + "description": { + "type": "string", + "value": "Average isolated kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002757727249773843" + } + }, + "GPU Relative Standard Deviation (Cold)": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "Noise" + }, + "description": { + "type": "string", + "value": "Relative standard deviation of the cold GPU execution time measurements." + }, + "value": { + "type": "float64", + "value": "0.009988896437015563" + } + }, + "Element Throughput": { + "hint": { + "type": "string", + "value": "item_rate" + }, + "short_name": { + "type": "string", + "value": "Elem/s" + }, + "description": { + "type": "string", + "value": "Number of input elements handled per second." + }, + "value": { + "type": "float64", + "value": "60837111434.337364" + } + }, + "Average Global Memory Throughput": { + "hint": { + "type": "string", + "value": "byte_rate" + }, + "short_name": { + "type": "string", + "value": "GlobalMem BW" + }, + "description": { + "type": "string", + "value": "Number of bytes read/written per second to the CUDA device's global memory." + }, + "value": { + "type": "float64", + "value": "486696891474.6989" + } + }, + "Percent Peak Global Memory Throughput": { + "hint": { + "type": "string", + "value": "percentage" + }, + "short_name": { + "type": "string", + "value": "BWPeak" + }, + "description": { + "type": "string", + "value": "Global device memory throughput as a percentage of the device's peak bandwidth." 
+ }, + "value": { + "type": "float64", + "value": "0.664741165147917" + } + }, + "Average GPU Time (Batch)": { + "hint": { + "type": "string", + "value": "duration" + }, + "short_name": { + "type": "string", + "value": "Batch GPU" + }, + "description": { + "type": "string", + "value": "Average back-to-back kernel execution time as measured by CUDA events." + }, + "value": { + "type": "float64", + "value": "0.0002754124925807889" + } + }, + "Number of Samples (Batch)": { + "hint": { + "type": "string", + "value": "sample_size" + }, + "short_name": { + "type": "string", + "value": "Batch" + }, + "description": { + "type": "string", + "value": "Number of kernel executions in hot time measurements." + }, + "value": { + "type": "int64", + "value": "46735" + } + } + }, + "is_skipped": false + } + } + } + ] +} diff --git a/examples/outputs/nvbench.example.throughput.list.md b/examples/outputs/nvbench.example.throughput.list.md new file mode 100644 index 0000000..b39ce69 --- /dev/null +++ b/examples/outputs/nvbench.example.throughput.list.md @@ -0,0 +1,32 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 30117 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 14939 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Benchmarks + +## [0] 
`throughput_bench` (1 configurations) + diff --git a/examples/outputs/nvbench.example.throughput.md b/examples/outputs/nvbench.example.throughput.md new file mode 100644 index 0000000..50198c6 --- /dev/null +++ b/examples/outputs/nvbench.example.throughput.md @@ -0,0 +1,56 @@ +# Devices + +## [0] `Quadro GV100` +* SM Version: 700 (PTX Version: 700) +* Number of SMs: 80 +* SM Default Clock Rate: 1627 MHz +* Global Memory: 32163 MiB Free / 32507 MiB Total +* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz) +* Max Shared Memory: 96 KiB/SM, 48 KiB/Block +* L2 Cache Size: 6144 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +## [1] `Quadro GP100` +* SM Version: 600 (PTX Version: 600) +* Number of SMs: 56 +* SM Default Clock Rate: 1442 MHz +* Global Memory: 15999 MiB Free / 16278 MiB Total +* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz) +* Max Shared Memory: 64 KiB/SM, 48 KiB/Block +* L2 Cache Size: 4096 KiB +* Maximum Active Blocks: 32/SM +* Maximum Active Threads: 2048/SM, 1024/Block +* Available Registers: 65536/SM, 65536/Block +* ECC Enabled: No + +# Log + +``` +Run: throughput_bench [Device=0] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.15% > 0.50%) +Pass: Cold: 0.264787ms GPU, 0.270938ms CPU, 12.64s total GPU, 47755x +Pass: Batch: 0.263257ms GPU, 12.57s total GPU, 47756x +Run: throughput_bench [Device=1] +Warn: Current measurement timed out (15.00s) while over noise threshold (1.00% > 0.50%) +Pass: Cold: 0.275773ms GPU, 0.280471ms CPU, 12.89s total GPU, 46734x +Pass: Batch: 0.275412ms GPU, 12.87s total GPU, 46735x +``` + +# Benchmark Results + +## throughput_bench + +### [0] Quadro GV100 + +| NumElements | DataSize | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | 
+|-------------|------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------| +| 16777216 | 64.000 MiB | 47755x | 270.938 us | 1.12% | 264.787 us | 1.15% | 63.361G | 506.889 GB/s | 58.24% | 263.257 us | 47756x | + +### [1] Quadro GP100 + +| NumElements | DataSize | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | GlobalMem BW | BWPeak | Batch GPU | Batch | +|-------------|------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------| +| 16777216 | 64.000 MiB | 46734x | 280.471 us | 0.99% | 275.773 us | 1.00% | 60.837G | 486.697 GB/s | 66.47% | 275.412 us | 46735x | diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu index b254d61..4dd12b5 100644 --- a/nvbench/markdown_printer.cu +++ b/nvbench/markdown_printer.cu @@ -172,6 +172,11 @@ void markdown_printer::do_print_benchmark_list( bench_ptr->get_name(), num_configs); + if (axes.empty()) + { + continue; + } + fmt::format_to(buffer, "### Axes\n\n"); for (const auto &axis_ptr : axes) { diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 4eee401..2ede52a 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -353,7 +353,7 @@ void option_parser::parse_impl() if (m_exit_after_parsing) { - std::exit(0); + this->cleanup_and_exit(0); } if (m_benchmarks.empty()) @@ -406,22 +406,22 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->print_version(); fmt::print("\n"); this->print_help(); - std::exit(0); + this->cleanup_and_exit(0); } else if (arg == "--help-axes" || arg == "--help-axis") { this->print_help_axis(); - std::exit(0); + this->cleanup_and_exit(0); } else if (arg == "--version") { this->print_version(); - std::exit(0); + this->cleanup_and_exit(0); } else if (arg == "--list" || arg == "-l") { this->print_list(); - std::exit(0); + this->cleanup_and_exit(0); } else if (arg == "--persistence-mode" || arg == "--pm") { 
@@ -590,11 +590,28 @@ void option_parser::print_version() const void option_parser::print_list() const { - const auto &bench_mgr = nvbench::benchmark_manager::get(); + auto do_print = [](auto &&printer) { + printer.print_device_info(); - nvbench::markdown_printer printer{std::cout}; - printer.print_device_info(); - printer.print_benchmark_list(bench_mgr.get_benchmarks()); + const auto &bench_mgr = nvbench::benchmark_manager::get(); + printer.print_benchmark_list(bench_mgr.get_benchmarks()); + }; + + // Try to find a markdown printer in the current list: + for (const auto &printer : m_printer.get_printers()) + { + if (const auto *md_printer_const = + dynamic_cast(printer.get()); + md_printer_const) + { + auto &md_printer = const_cast(*md_printer_const); + do_print(md_printer); + return; + } + } + + // Fallback to a new stdout printer. + do_print(nvbench::markdown_printer{std::cout}); } void option_parser::print_help() const @@ -1012,4 +1029,11 @@ void option_parser::update_used_device_state() const nvbench::printer_base &option_parser::get_printer() { return m_printer; } +void option_parser::cleanup_and_exit(int exit_code) +{ + // Free all ofstreams to make sure they flush: + m_ofstream_storage.clear(); + std::exit(exit_code); +} + } // namespace nvbench diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index 19d2984..8b6074b 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -121,6 +121,9 @@ private: void update_used_device_state() const; + // Releases any important resources and calls `std::exit(exit_code)` + [[noreturn]] void cleanup_and_exit(int exit_code); + // less gross argv: std::vector m_args; diff --git a/nvbench/printer_multiplex.cuh b/nvbench/printer_multiplex.cuh index d34ceb2..f607879 100644 --- a/nvbench/printer_multiplex.cuh +++ b/nvbench/printer_multiplex.cuh @@ -46,6 +46,9 @@ struct printer_multiplex : nvbench::printer_base return m_printers.size(); } + [[nodiscard]] const auto &get_printers() const { return 
m_printers; } + [[nodiscard]] auto &get_printers() { return m_printers; } + private: void do_print_device_info() override; void do_print_log_preamble() override;