diff --git a/README.md b/README.md
index a7240ec..c958ec9 100644
--- a/README.md
+++ b/README.md
@@ -58,11 +58,19 @@ various NVBench features and usecases:
 
 - [Runtime and compile-time parameter sweeps](examples/axes.cu)
 - [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
-- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
+- [Reporting simple item/sec and byte/sec throughput statistics](examples/throughput.cu)
+- [Gathering and reporting CUPTI metrics](examples/auto_throughput.cu)
 - [Skipping benchmark configurations](examples/skip.cu)
 - [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
 - [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
 
+### Example Output Samples
+
+Sample outputs for `--list`, `--markdown`, `--json`, and `--csv` are provided
+for each example. These are located in the [examples/outputs](examples/outputs/)
+directory. See the associated [README](examples/outputs/README.md) for more
+information.
+
 ### Building Examples
 
 To build the examples:
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 34c8763..2e01fbb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -11,6 +11,7 @@ set(example_srcs
 # Metatarget for all examples:
 add_custom_target(nvbench.example.all)
 add_dependencies(nvbench.all nvbench.example.all)
+set(examples)
 
 foreach(example_src IN LISTS example_srcs)
   get_filename_component(example_name "${example_src}" NAME_WLE)
@@ -25,4 +26,7 @@ foreach(example_src IN LISTS example_srcs)
   )
 
   add_dependencies(nvbench.example.all ${example_name})
+  list(APPEND examples ${example_name})
 endforeach()
+
+add_subdirectory(outputs)
diff --git a/examples/outputs/CMakeLists.txt b/examples/outputs/CMakeLists.txt
new file mode 100644
index 0000000..179114d
--- /dev/null
+++ b/examples/outputs/CMakeLists.txt
@@ -0,0 +1,65 @@
+# Defines targets that run each example and write sample outputs (--list,
+# markdown, csv, json) into ${results_dir}. The `examples` list of example
+# target names is expected to be set by the including CMakeLists.txt.
+
+set(NVBench_EXAMPLE_OUTPUT_DEVICES "all" CACHE STRING
+  "--devices argument used when generating example outputs."
+)
+set(NVBench_EXAMPLE_OUTPUT_TIMEOUT "15.0" CACHE STRING
+  "--timeout argument used when generating example outputs."
+)
+mark_as_advanced(
+  NVBench_EXAMPLE_OUTPUT_DEVICES
+  NVBench_EXAMPLE_OUTPUT_TIMEOUT
+)
+
+# Pool of depth 1: the `.outputs` targets below join it so that benchmark
+# runs execute one at a time. APPEND preserves job pools registered elsewhere
+# in the build; a plain set would replace the whole JOB_POOLS list. Job pools
+# are only honored by the Ninja generators.
+set_property(GLOBAL APPEND PROPERTY JOB_POOLS exclusive=1)
+
+# Umbrella targets: all sample outputs, and only the --list outputs.
+add_custom_target(nvbench.regenerate_example_outputs)
+add_custom_target(nvbench.example.all.list)
+
+set(results_dir "${CMAKE_CURRENT_BINARY_DIR}/results")
+
+foreach (example IN LISTS examples)
+  # Print --list. The directory is created explicitly so the run does not
+  # rely on the generator creating it.
+  add_custom_target(${example}.list
+    COMMAND "${CMAKE_COMMAND}" -E make_directory "${results_dir}"
+    COMMAND "$<TARGET_FILE:${example}>"
+      --md "${results_dir}/${example}.list.md"
+      --list
+    BYPRODUCTS "${results_dir}/${example}.list.md"
+    COMMENT "Generating ${example}.list.md..."
+    VERBATIM
+  )
+  add_dependencies(${example}.list nvbench.example.all)
+  add_dependencies(nvbench.regenerate_example_outputs ${example}.list)
+  add_dependencies(nvbench.example.all.list ${example}.list)
+
+  # Run and output all formats. Cache values are quoted so each is passed as
+  # exactly one argument.
+  add_custom_target(${example}.outputs
+    COMMAND "${CMAKE_COMMAND}" -E make_directory "${results_dir}"
+    COMMAND "$<TARGET_FILE:${example}>"
+      --quiet
+      --devices "${NVBench_EXAMPLE_OUTPUT_DEVICES}"
+      --timeout "${NVBench_EXAMPLE_OUTPUT_TIMEOUT}"
+      --md "${results_dir}/${example}.md"
+      --csv "${results_dir}/${example}.csv"
+      --json "${results_dir}/${example}.json"
+    BYPRODUCTS
+      "${results_dir}/${example}.md"
+      "${results_dir}/${example}.csv"
+      "${results_dir}/${example}.json"
+    COMMENT "Generating ${example} outputs (json, csv, md)..."
+    JOB_POOL exclusive
+    VERBATIM
+  )
+  add_dependencies(${example}.outputs nvbench.example.all)
+  add_dependencies(nvbench.regenerate_example_outputs ${example}.outputs)
+endforeach()
diff --git a/examples/outputs/README.md b/examples/outputs/README.md
new file mode 100644
index 0000000..1256bd6
--- /dev/null
+++ b/examples/outputs/README.md
@@ -0,0 +1,32 @@
+# Example Outputs
+
+This directory contains sample outputs for each of the NVBench examples:
+
+- `${example}.list.md`: The output of invoking the example with `--list`.
+- `${example}.md`: The output of invoking the example with `--md stdout`.
+- `${example}.json`: The output of invoking the example with `--json stdout`.
+- `${example}.csv`: The output of invoking the example with `--csv stdout`.
+
+These files are only meant to provide samples of NVBench output formats. The
+results and measurements in these files are not intended to demonstrate peak
+performance of the devices and algorithms measured. The results may
+intentionally include errors (noisy results, timeouts, etc.) to show how these
+are reported.
+
+# Generating
+
+The outputs are generated by building the `nvbench.regenerate_example_outputs`
+target. Ideally, CUPTI metrics should be enabled and the GPU clocks locked
+to `base`:
+
+```bash
+# Enable non-root users to collect CUPTI metrics:
+sudo rmmod nvidia
+sudo modprobe nvidia NVreg_RestrictProfilingToAdminUsers=0
+# Enable persistence mode and lock GPU clocks to base:
+sudo <any nvbench example> --pm 1 --lgc base
+
+ninja nvbench.regenerate_example_outputs
+```
+
+The files will be written to `<nvbench_build_dir>/examples/outputs/results/`.
diff --git a/examples/outputs/nvbench.example.auto_throughput.csv b/examples/outputs/nvbench.example.auto_throughput.csv
new file mode 100644
index 0000000..aaed1be
--- /dev/null
+++ b/examples/outputs/nvbench.example.auto_throughput.csv
@@ -0,0 +1,9 @@
+Benchmark,Device,Device Name,T,Stride,Skipped,Elements,HBWPeak,LoadEff,StoreEff,L1HitRate,L2HitRate,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),Batch GPU (sec),Batch
+throughput_bench,0,Quadro GV100,1,1,No,33554432,0.6585438475423353,1,1,0,0.49999904648868265,1078,0.00047011594805194853,0.0014244727929907238,0.0004639086524829754,0.0015463138823336802,72329825754.28767,0.0004611097332451499,1134
+throughput_bench,0,Quadro GV100,1,4,No,33554432,0.6947096407739456,0.25,1,0,0.2000340992245478,452,0.0011125579800884962,0.0006852662282240376,0.0011062891657373588,0.0006886487256687341,30330616116.660107,0.001102912886702233,473
+throughput_bench,0,Quadro GV100,2,1,No,33554432,0.671727291393325,0.5069332663594991,1,0.2682065963745117,0.5437664402009483,548,0.0009194173795620432,0.0006406757855676574,0.0009131468997819581,0.0006974456101610215,36745929935.27345,0.0009104959699842665,576
+throughput_bench,0,Quadro GV100,2,4,No,33554432,0.5825672478320796,0.12669523778540573,1,0.20691384209526908,0.3812073432135215,175,0.0028746560114285722,0.002486847758258855,0.002868418363843645,0.0024933453791821615,11697886341.460135,0.002855257117229959,184
+throughput_bench,1,Quadro GP100,1,1,Yes,,,,,,,,,,,,,,
+throughput_bench,1,Quadro GP100,1,4,Yes,,,,,,,,,,,,,,
+throughput_bench,1,Quadro GP100,2,1,Yes,,,,,,,,,,,,,,
+throughput_bench,1,Quadro GP100,2,4,Yes,,,,,,,,,,,,,,
diff --git a/examples/outputs/nvbench.example.auto_throughput.json b/examples/outputs/nvbench.example.auto_throughput.json
new file mode 100644
index 0000000..7a71ae8
--- /dev/null
+++ b/examples/outputs/nvbench.example.auto_throughput.json
@@ -0,0 +1,1293 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "throughput_bench",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "T": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "1",
+              "description": "nvbench::enum_type<1, int>",
+              "is_active": true
+            },
+            {
+              "input_string": "2",
+              "description": "nvbench::enum_type<2, int>",
+              "is_active": true
+            }
+          ]
+        },
+        "Stride": {
+          "type": "int64",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "1",
+              "description": "",
+              "value": 1
+            },
+            {
+              "input_string": "4",
+              "description": "",
+              "value": 4
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 T=1 Stride=1": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "1"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Peak Sustained Global Memory Throughput (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "HBWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "The utilization level of the device memory relative to the peak utilization."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6585438475423353"
+              }
+            },
+            "Global Load Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "LoadEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1"
+              }
+            },
+            "Global Store Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "StoreEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1"
+              }
+            },
+            "L1 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L1HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L1 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0"
+              }
+            },
+            "L2 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L2HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L2 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.49999904648868265"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1078"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00047011594805194853"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014244727929907238"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004639086524829754"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015463138823336802"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "72329825754.28767"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004611097332451499"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1134"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=1 Stride=4": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "1"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "4"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Peak Sustained Global Memory Throughput (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "HBWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "The utilization level of the device memory relative to the peak utilization."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6947096407739456"
+              }
+            },
+            "Global Load Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "LoadEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.25"
+              }
+            },
+            "Global Store Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "StoreEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1"
+              }
+            },
+            "L1 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L1HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L1 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0"
+              }
+            },
+            "L2 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L2HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L2 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2000340992245478"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "452"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011125579800884962"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006852662282240376"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011062891657373588"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006886487256687341"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "30330616116.660107"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001102912886702233"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "473"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=2 Stride=1": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "2"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Peak Sustained Global Memory Throughput (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "HBWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "The utilization level of the device memory relative to the peak utilization."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.671727291393325"
+              }
+            },
+            "Global Load Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "LoadEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5069332663594991"
+              }
+            },
+            "Global Store Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "StoreEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1"
+              }
+            },
+            "L1 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L1HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L1 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2682065963745117"
+              }
+            },
+            "L2 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L2HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L2 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5437664402009483"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "548"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009194173795620432"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006406757855676574"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009131468997819581"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006974456101610215"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "36745929935.27345"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009104959699842665"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "576"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=2 Stride=4": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "2"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "4"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Peak Sustained Global Memory Throughput (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "HBWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "The utilization level of the device memory relative to the peak utilization."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5825672478320796"
+              }
+            },
+            "Global Load Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "LoadEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory load throughput to required global memory load throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.12669523778540573"
+              }
+            },
+            "Global Store Efficiency (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "StoreEff"
+              },
+              "description": {
+                "type": "string",
+                "value": "Ratio of requested global memory store throughput to required global memory store throughput expressed as percentage."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1"
+              }
+            },
+            "L1 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L1HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L1 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.20691384209526908"
+              }
+            },
+            "L2 Cache Hit Rate (HW)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "L2HitRate"
+              },
+              "description": {
+                "type": "string",
+                "value": "Hit rate at L2 cache."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3812073432135215"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "175"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0028746560114285722"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002486847758258855"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002868418363843645"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024933453791821615"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "11697886341.460135"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002855257117229959"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "184"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=1 Stride=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "1"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            }
+          },
+          "is_skipped": true,
+          "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)"
+        },
+        "Device=1 T=1 Stride=4": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "1"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "4"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            }
+          },
+          "is_skipped": true,
+          "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)"
+        },
+        "Device=1 T=2 Stride=1": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "2"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            }
+          },
+          "is_skipped": true,
+          "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)"
+        },
+        "Device=1 T=2 Stride=4": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "2"
+            },
+            "Stride": {
+              "type": "int64",
+              "value": "4"
+            }
+          },
+          "summaries": {
+            "Element count: Elements": {
+              "short_name": {
+                "type": "string",
+                "value": "Elements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            }
+          },
+          "is_skipped": true,
+          "skip_reason": "Unexpected error: Device: 1 isn't supported (CC 600)"
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.auto_throughput.list.md b/examples/outputs/nvbench.example.auto_throughput.list.md
new file mode 100644
index 0000000..8f808da
--- /dev/null
+++ b/examples/outputs/nvbench.example.auto_throughput.list.md
@@ -0,0 +1,41 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 31601 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15563 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `throughput_bench` (4 configurations)
+
+### Axes
+
+* `T` : type
+  * `1` (nvbench::enum_type<1, int>)
+  * `2` (nvbench::enum_type<2, int>)
+* `Stride` : int64
+  * `1`
+  * `4`
+
diff --git a/examples/outputs/nvbench.example.auto_throughput.md b/examples/outputs/nvbench.example.auto_throughput.md
new file mode 100644
index 0000000..ded46d8
--- /dev/null
+++ b/examples/outputs/nvbench.example.auto_throughput.md
@@ -0,0 +1,73 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  throughput_bench [Device=0 T=1 Stride=1]
+Pass: Cold: 0.463909ms GPU, 0.470116ms CPU, 0.50s total GPU, 1078x
+Pass: Batch: 0.461110ms GPU, 0.52s total GPU, 1134x
+Run:  throughput_bench [Device=0 T=1 Stride=4]
+Pass: Cold: 1.106289ms GPU, 1.112558ms CPU, 0.50s total GPU, 452x
+Pass: Batch: 1.102913ms GPU, 0.52s total GPU, 473x
+Run:  throughput_bench [Device=0 T=2 Stride=1]
+Pass: Cold: 0.913147ms GPU, 0.919417ms CPU, 0.50s total GPU, 548x
+Pass: Batch: 0.910496ms GPU, 0.52s total GPU, 576x
+Run:  throughput_bench [Device=0 T=2 Stride=4]
+Pass: Cold: 2.868418ms GPU, 2.874656ms CPU, 0.50s total GPU, 175x
+Pass: Batch: 2.855257ms GPU, 0.53s total GPU, 184x
+Run:  throughput_bench [Device=1 T=1 Stride=1]
+Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600)
+Fail: Unexpected error: Device: 1 isn't supported (CC 600)
+Run:  throughput_bench [Device=1 T=1 Stride=4]
+Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600)
+Fail: Unexpected error: Device: 1 isn't supported (CC 600)
+Run:  throughput_bench [Device=1 T=2 Stride=1]
+Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600)
+Fail: Unexpected error: Device: 1 isn't supported (CC 600)
+Run:  throughput_bench [Device=1 T=2 Stride=4]
+Warn: CUPTI failed to construct profiler: Device: 1 isn't supported (CC 600)
+Fail: Unexpected error: Device: 1 isn't supported (CC 600)
+```
+
+# Benchmark Results
+
+## throughput_bench
+
+### [0] Quadro GV100
+
+| T | Stride | Elements | HBWPeak | LoadEff | StoreEff | L1HitRate | L2HitRate | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | Batch GPU  | Batch |
+|---|--------|----------|---------|---------|----------|-----------|-----------|---------|------------|-------|------------|-------|---------|------------|-------|
+| 1 |      1 | 33554432 |  65.85% | 100.00% |  100.00% |     0.00% |    50.00% |   1078x | 470.116 us | 0.14% | 463.909 us | 0.15% | 72.330G | 461.110 us | 1134x |
+| 1 |      4 | 33554432 |  69.47% |  25.00% |  100.00% |     0.00% |    20.00% |    452x |   1.113 ms | 0.07% |   1.106 ms | 0.07% | 30.331G |   1.103 ms |  473x |
+| 2 |      1 | 33554432 |  67.17% |  50.69% |  100.00% |    26.82% |    54.38% |    548x | 919.417 us | 0.06% | 913.147 us | 0.07% | 36.746G | 910.496 us |  576x |
+| 2 |      4 | 33554432 |  58.26% |  12.67% |  100.00% |    20.69% |    38.12% |    175x |   2.875 ms | 0.25% |   2.868 ms | 0.25% | 11.698G |   2.855 ms |  184x |
+
+### [1] Quadro GP100
+
+No data -- check log.
diff --git a/examples/outputs/nvbench.example.axes.csv b/examples/outputs/nvbench.example.axes.csv
new file mode 100644
index 0000000..177dc56
--- /dev/null
+++ b/examples/outputs/nvbench.example.axes.csv
@@ -0,0 +1,127 @@
+Benchmark,Device,Device Name,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,Duration,BlockSize (pow2),BlockSize,NumBlocks (pow2),NumBlocks,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak,T,In,Out,Items,InSize (bytes),OutSize (bytes)
+simple,0,Quadro GV100,No,499,0.0010102518997995992,0.0005320863289715677,0.0010037636295826922,0.0005606955085401353,0.001001475909284053,524,,,,,,,,,,,,,,
+simple,1,Quadro GP100,No,499,0.0010072372945891786,0.0004201092756117083,0.001002567436508759,0.0003010855735431417,0.001001474511532383,524,,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,147957,1.0618111072811744e-05,0.03254637275181478,4.42401244240246e-06,0.1078451537144948,2.0427748176574993e-06,244766,0,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,4831,0.00011004767191057754,0.004150807794866266,0.00010351461256001611,0.004781428015913556,0.00010137620362095862,5088,0.0001,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,2453,0.00021036899388503913,0.002247040450593631,0.0002039032309542721,0.0024717338672984117,0.00020172918129115027,2582,0.0002,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,1648,0.00030986622087378617,0.0014834014227566264,0.0003034121238635499,0.0016613620256970604,0.00030116395027406757,1736,0.00030000000000000003,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,1239,0.0004101481057304273,0.0012074050648415652,0.000403672849583562,0.00138227314754308,0.00040141034272550807,1304,0.0004,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,992,0.0005105290151209681,0.0009132925978805083,0.0005040890874881937,0.001126663137928465,0.0005017619947554283,1042,0.0005,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,829,0.0006098617478890229,0.0007603582722561039,0.000603470634925204,0.0009715240544303389,0.0006011044563503441,872,0.0006000000000000001,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,711,0.0007102935302390999,0.0006584784515618755,0.0007037439938168366,0.0008111324463740949,0.0007014426981064088,748,0.0007000000000000001,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,622,0.000810564897106109,0.0006072346537084304,0.0008041868904587526,0.000707547745748677,0.0008017951065694391,653,0.0008000000000000001,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,554,0.0009098726931407932,0.0005164841732610924,0.000903432962588876,0.0005681374617078215,0.0009011252491744524,582,0.0009000000000000002,,,,,,,,,,,,,
+single_float64_axis,0,Quadro GV100,No,499,0.0010102697054108218,0.00044759296847813034,0.0010038065348932821,0.0005354727313941588,0.0010014759304418617,523,0.0010000000000000002,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,152839,7.705229234684682e-06,0.05418085805591698,3.016308709558017e-06,0.04127500754098809,1.3434882326935044e-06,372166,0,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,4879,0.00010715639024390229,0.004066079001791793,0.0001024813676868744,0.00308685243303526,0.0001013762513461915,5107,0.0001,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,2466,0.00020754400243309012,0.0019319860083082698,0.00020283288218606642,0.0014786760562713035,0.00020172824844867798,2586,0.0002,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,1655,0.00030687972205438067,0.0013377397645028726,0.00030219129368978006,0.0010508732574554497,0.00030105650589762746,1736,0.00030000000000000003,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,1243,0.0004072136267095737,0.001056923897828408,0.0004025078938310397,0.0007582628414963917,0.0004014090936302682,1305,0.0004,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,995,0.0005075617336683415,0.0007933683824162227,0.0005028640134849749,0.0006232695683416088,0.0005017614337245814,1045,0.0005,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,831,0.0006069544103489773,0.0007196728740232229,0.0006022227483966297,0.0004991286373051354,0.0006010893901462271,873,0.0006000000000000001,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,712,0.0007072545856741567,0.0005896830799502328,0.000702559235139509,0.0004401401059381315,0.0007014417189327791,748,0.0007000000000000001,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,623,0.0008076356067415732,0.0005473369257973612,0.0008029095356384018,0.00040823678026317246,0.0008017940608599713,655,0.0008000000000000001,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,555,0.0009069345225225232,0.0004925253150638496,0.0009022477837296215,0.0003476692854287226,0.000901123046875,582,0.0009000000000000002,,,,,,,,,,,,,
+single_float64_axis,1,Quadro GP100,No,499,0.0010072963186372732,0.0004478227973886651,0.0010025936021116784,0.0003019453543048136,0.001001473230260019,524,0.0010000000000000002,,,,,,,,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,66,0.00762213684848485,0.001421350078274123,0.007615782766631154,0.001435096779193961,0.007614612524060236,69,,2^6,64,2^6,64,8811814367.137686,70494514937.10149,0.08099094087442726,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,206,0.0024424633495145616,0.004624145111239578,0.0024361206686612466,0.004621497870662145,0.0024339480377906972,215,,2^8,256,2^6,64,27547430167.685093,220379441341.48074,0.25319329198239976,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,13161,0.0011122250784894763,0.012946747378394284,0.0011059346540513448,0.013016477670773642,0.0011028085603954402,13162,,2^10,1024,2^6,64,60680677428.96169,485445419431.69354,0.5577268146044273,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,375,0.0024504410826666676,0.005006459599172983,0.0024441843128204348,0.004997852033101851,0.002444396898467488,376,,2^6,64,2^8,256,27456548038.53994,219652384308.31952,0.25235797829540385,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,13509,0.0010829132057147126,0.00962532025494488,0.0010766412656412294,0.009687962765161975,0.0010755151240936572,13510,,2^8,256,2^8,256,62331684788.27634,498653478306.2107,0.5729015145981281,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,15105,0.0009647508075471688,0.00509700309681285,0.0009584777770750065,0.0051422186996564374,0.0009572492433860444,15106,,2^10,1024,2^8,256,70016087597.56184,560128700780.4948,0.6435302168893552,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,13582,0.0010768811630834938,0.007374891553423134,0.0010706156819352108,0.007408403333483488,0.0010709149855748453,13583,,2^6,64,2^10,1024,62682496746.82156,501459973974.5725,0.5761258892171099,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,1782,0.0009628989545454543,0.004952339328546463,0.0009565676676170035,0.004999266556092292,0.0009545994625289475,1783,,2^8,256,2^10,1024,70155898293.3024,561247186346.4192,0.6448152416663824,,,,,,
+copy_sweep_grid_shape,0,Quadro GV100,No,14579,0.0010004705825502492,0.019254279361810064,0.000994218452561916,0.019368874235788265,0.0009928190472480985,14580,,2^10,1024,2^10,1024,67499113325.72127,539992906605.77014,0.6203962621849382,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,2236,0.006688950130143119,0.010949635482023172,0.006684225965911029,0.010951296261576433,0.006674802853478132,2237,,2^6,64,2^6,64,10039885596.66435,80319084773.3148,0.10970154716634999,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,218,0.002301079724770642,0.0028902989580355323,0.0022963436360752907,0.00289533405223059,0.0022982710453501917,228,,2^8,256,2^6,64,29224225392.80601,233793803142.4481,0.31932064458922654,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,426,0.0011791361924882624,0.003945300386512142,0.0011743737087003502,0.003937393066379565,0.0011721584260596465,449,,2^10,1024,2^6,64,57144385558.72278,457155084469.7822,0.6243923247238066,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,226,0.00221838060176991,0.0015555610970668225,0.002213621245021314,0.00156938592170116,0.002213029949976925,237,,2^6,64,2^8,256,30316326314.149483,242530610513.19586,0.33125356549551443,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,12933,0.0011352358293512678,0.006664341285751961,0.0011305139959895455,0.006686678312757843,0.0011301243156632502,12934,,2^8,256,2^8,256,59361373886.62687,474890991093.01495,0.6486164104745069,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,447,0.001123642310961969,0.0021689290927995966,0.001118954880392258,0.0021631706036555243,0.001117002699110243,468,,2^10,1024,2^8,256,59974593413.87786,479796747311.0229,0.6553167986656235,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,448,0.0011216752544642855,0.003016436766525619,0.0011169237145887954,0.0030169455335936853,0.0011148893315741358,470,,2^6,64,2^10,1024,60083659361.37964,480669274891.0371,0.6565085157493404,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,448,0.0011223883750000004,0.002772433370845318,0.001117700926693421,0.002767310086041235,0.001115604862286027,471,,2^8,256,2^10,1024,60041879180.089096,480335033440.71277,0.6560520015306938,,,,,,
+copy_sweep_grid_shape,1,Quadro GP100,No,474,0.001060387455696202,0.0015438063319203115,0.0010557063963845812,0.0015281670861921415,0.0010540968806868097,498,,2^10,1024,2^10,1024,63567734580.20524,508541876641.6419,0.6945775194515432,,,,,,
+copy_type_sweep,0,Quadro GV100,No,197,0.0025498305939086305,0.0029889374958217657,0.002543548455698237,0.0030039924801443835,0.002539370934940079,206,,,,,,105535813716.71214,211071627433.4243,0.24249957195935695,U8,,,,,
+copy_type_sweep,0,Quadro GV100,No,314,0.001601867525477707,0.0041088290293520464,0.001595620784789893,0.004121783467130656,0.0015915001814459024,331,,,,,,84116307132.25725,336465228529.029,0.3865639114533881,U16,,,,,
+copy_type_sweep,0,Quadro GV100,No,13509,0.0010828757243319257,0.009631338263504865,0.0010766108759909336,0.00968747141038187,0.0010754745522399284,13510,,,,,,62333444233.72251,498667553869.7801,0.5729176859717142,U32,,,,,
+copy_type_sweep,0,Quadro GV100,No,15542,0.000936680335671086,0.006014811602456382,0.0009304157742707227,0.006054554605786883,0.0009291894323484043,15543,,,,,,36063911347.91389,577022581566.6222,0.6629395468366523,U64,,,,,
+copy_type_sweep,0,Quadro GV100,No,13508,0.0010830444422564424,0.009669058622361396,0.0010767860333789413,0.009724931080690958,0.0010753850626975213,13509,,,,,,62323304648.94053,498586437191.52423,0.5728244912586445,F32,,,,,
+copy_type_sweep,0,Quadro GV100,No,15546,0.0009366829067284162,0.005915735845058088,0.000930411673893452,0.005957150780596753,0.000929181897058261,15547,,,,,,36064070283.626465,577025124538.0234,0.6629424684490159,F64,,,,,
+copy_type_sweep,1,Quadro GP100,No,5497,0.0027078292635983312,0.006395900728715414,0.002703092247003721,0.006391943780941085,0.0026997315984675927,5498,,,,,,99306805491.95126,198613610983.90253,0.2712707754915627,U8,,,,,
+copy_type_sweep,1,Quadro GP100,No,330,0.001520048475757576,0.004360351977281759,0.0015153354655612601,0.004371610333527513,0.0015136887122844827,348,,,,,,88572947080.2609,354291788321.0436,0.48389940494023653,U16,,,,,
+copy_type_sweep,1,Quadro GP100,No,12935,0.001135307039969079,0.006620691359921369,0.0011306220265028327,0.006643085804704841,0.0011301225775930074,12936,,,,,,59355701929.47401,474845615435.79205,0.6485544354182038,U32,,,,,
+copy_type_sweep,1,Quadro GP100,No,478,0.0010522014937238498,0.002705184693469705,0.0010475129038718955,0.0026904354098530054,0.0010449056396484376,500,,,,,,32032476044.899876,512519616718.398,0.7000104030791057,U64,,,,,
+copy_type_sweep,1,Quadro GP100,No,12933,0.0011353411816283944,0.0066894954953519185,0.0011306308270201252,0.006710880496820158,0.001130335244040081,12934,,,,,,59355239921.125435,474841919369.0035,0.6485493872500594,F32,,,,,
+copy_type_sweep,1,Quadro GP100,No,477,0.0010531248867924527,0.0028403001389236257,0.0010484168726943076,0.0028275833565767315,0.001045540454641914,497,,,,,,32004856917.047768,512077710672.7643,0.6994068382221977,F64,,,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I8,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,712,0.0007091774676966297,0.002907153358900259,0.0007029326261764157,0.0029793074635008562,0.0006997137222698028,748,,,,,,95469838076.85379,286409514230.56134,0.3290550485185677,,I8,I16,67108864,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,No,622,0.0008109339212218649,0.0030780801739523534,0.0008046977720268298,0.003128636670235119,0.0008020894928445145,654,,,,,,83396358648.99908,416981793244.9954,0.4790691558421363,,I8,I32,67108864,67108864,268435456
+copy_type_conversion_sweep,0,Quadro GV100,No,614,0.0008210284218241037,0.0032226581332443155,0.0008147683657147577,0.0032802944549392547,0.0008120876105256783,645,,,,,,82365573853.77692,411827869268.8846,0.47314782774458247,,I8,F32,67108864,67108864,268435456
+copy_type_conversion_sweep,0,Quadro GV100,No,12047,0.001218285543537808,0.010379223139149431,0.0012120340953301184,0.010435925751575564,0.0012109437788209631,12048,,,,,,55368792229.992294,498319130069.93066,0.5725173828928432,,I8,I64,67108864,67108864,536870912
+copy_type_conversion_sweep,0,Quadro GV100,No,12345,0.0011880294179019875,0.007390662960171094,0.0011817586330519084,0.007441978357839151,0.00118048292704533,12346,,,,,,56787284749.24732,511085562743.2259,0.5871846998428606,,I8,F64,67108864,67108864,536870912
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I16,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I16,I16,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,30679,0.00045337666328758926,0.005585454361874255,0.0004471040883478709,0.005691409845285877,0.00044639401537845537,30680,,,,,,75048367649.66653,450290205897.9992,0.5173370931732527,,I16,I32,33554432,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,No,1111,0.00045644497029703035,0.004630719301683377,0.000450117155493756,0.004699806594964763,0.0004474967917148505,1162,,,,,,74545996726.5466,447275980359.2796,0.513874058317187,,I16,F32,33554432,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,No,21586,0.0006636389210136234,0.00678406319186359,0.0006573809432517179,0.006858260008522721,0.0006561174402032673,21587,,,,,,51042599187.65498,510425991876.5498,0.5864269208140508,,I16,I64,33554432,67108864,268435456
+copy_type_conversion_sweep,0,Quadro GV100,No,21638,0.0006617334348830768,0.006935544837856402,0.0006554566019343122,0.0070075276540148375,0.000653902132173752,21639,,,,,,51192454086.1711,511924540861.71094,0.5881485993356054,,I16,F64,33554432,67108864,268435456
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I16,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I32,I32,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,47462,0.00027253003156209153,0.012292429580379986,0.0002662904619916644,0.01259432062607704,0.00026489117725947637,47463,,,,,,63003443212.04103,504027545696.32825,0.5790757648165535,,I32,F32,16777216,67108864,67108864
+copy_type_conversion_sweep,0,Quadro GV100,No,35529,0.0003843450104703185,0.007061698560844491,0.0003780983807571241,0.007193170430510475,0.0003773213782537234,35530,,,,,,44372620603.146774,532471447237.7613,0.6117548796389721,,I32,I64,16777216,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,No,35498,0.0003847829341653037,0.007946267454909624,0.0003785711211350411,0.008088117160474145,0.00037782656535605304,35499,,,,,,44317210329.45711,531806523953.4853,0.6109909512333241,,I32,F64,16777216,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,I16,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,47607,0.00027168871905392146,0.01320649583700062,0.0002654402574849676,0.013542194342127112,0.00026387154338796694,47608,,,,,,63205243089.21048,505641944713.68384,0.580930543099361,,F32,I32,16777216,67108864,67108864
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F32,F32,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,35499,0.00038478567168089093,0.007793965803915478,0.00037854311241399495,0.007936841129938173,0.0003779212159950015,35500,,,,,,44320489396.863045,531845872762.35657,0.6110361589641045,,F32,I64,16777216,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,No,35509,0.00038455552378270424,0.008192214819970133,0.0003782884006425947,0.008364103785135068,0.0003775098086048749,35510,,,,,,44350331576.38646,532203978916.6376,0.6114475860715045,,F32,F64,16777216,67108864,134217728
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I16,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I32,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,F32,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,I64,I64,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,52100,0.00024245567445297397,0.008539487651977316,0.0002361851441692315,0.008809731550468568,0.00023509478066781645,52101,,,,,,35517085672.37146,568273370757.9434,0.6528876042715341,,I64,F64,8388608,67108864,67108864
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,I8,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,I16,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,I32,,,
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,F32,,,
+copy_type_conversion_sweep,0,Quadro GV100,No,51780,0.0002443157005407483,0.009752081536074006,0.00023802995230194252,0.010060889608084888,0.00023685153101656335,51781,,,,,,35241816917.89358,563869070686.2972,0.6478275168730437,,F64,I64,8388608,67108864,67108864
+copy_type_conversion_sweep,0,Quadro GV100,Yes,,,,,,,,,,,,,,,,,F64,F64,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I8,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,21576,0.0006664836124397485,0.008032774520430731,0.0006617529590178744,0.008026552131843242,0.000660956536269952,21577,,,,,,101410750168.15656,304232250504.46967,0.415527002983596,,I8,I16,67108864,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,No,16864,0.0008629180932163245,0.007882137751662148,0.0008582004840137586,0.00791837780480227,0.0008575679956081893,16865,,,,,,78197187312.3811,390985936561.9055,0.534017068075155,,I8,I32,67108864,67108864,268435456
+copy_type_conversion_sweep,1,Quadro GP100,No,16866,0.0008626138966559889,0.007971447505119814,0.0008578826743068147,0.008007931525887958,0.0008570867944797,16867,,,,,,78226156104.88371,391130780524.4186,0.5342148990991294,,I8,F32,67108864,67108864,268435456
+copy_type_conversion_sweep,1,Quadro GP100,No,10113,0.0014600333443093004,0.005526790210274132,0.0014553281934728316,0.0055395074611650314,0.0014537668022591766,10114,,,,,,46112529325.676674,415012763931.09,0.5668334297572799,,I8,I64,67108864,67108864,536870912
+copy_type_conversion_sweep,1,Quadro GP100,No,10100,0.0014618894557425724,0.005445897883226132,0.001457197949886321,0.005462563281467926,0.0014559329674746807,10101,,,,,,46053361525.27205,414480253727.4484,0.5661061157772187,,I8,F64,67108864,67108864,536870912
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I16,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I16,I16,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,30414,0.00046039200555665144,0.00783107553466636,0.00045566900502105564,0.00787891812842585,0.0004554145292263416,30415,,,,,,73637731841.00926,441826391046.0556,0.6034560629453338,,I16,I32,33554432,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,No,30506,0.00045893235589720245,0.007769133803136496,0.00045424179060767235,0.007825865470567534,0.00045380600078266583,30507,,,,,,73869099439.55573,443214596637.33435,0.6053521042358697,,I16,F32,33554432,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,No,19198,0.0007536445464110835,0.0055724274957070705,0.0007489312136715944,0.005596667585138299,0.0007480235320061677,19199,,,,,,44803089238.998634,448030892389.9864,0.6119303053840505,,I16,I64,33554432,67108864,268435456
+copy_type_conversion_sweep,1,Quadro GP100,No,19239,0.0007522073903529291,0.005390014038421145,0.0007475173373753664,0.00541460189354359,0.0007464590773358217,19240,,,,,,44887831120.83274,448878311208.3274,0.6130877283767584,,I16,F64,33554432,67108864,268435456
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I16,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I32,I32,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,47007,0.00027858482081392173,0.007845561244706417,0.00027388008982136936,0.007927787169621173,0.0002735391274529489,47008,,,,,,61257523359.73918,490060186877.91345,0.669334826920227,,I32,F32,16777216,67108864,67108864
+copy_type_conversion_sweep,1,Quadro GP100,No,1196,0.00042285716053511705,0.0044259706705914136,0.000418105284879638,0.004434430384125105,0.00041619211102596686,1254,,,,,,40126773343.29734,481521280119.5681,0.6576722029605115,,I32,I64,16777216,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,No,1195,0.0004233829054393305,0.004675323526829325,0.0004187026210409825,0.004672495644368017,0.000416603259004343,1252,,,,,,40069527050.69847,480834324608.3817,0.6567339442312906,,I32,F64,16777216,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,I16,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,46545,0.000281580676807392,0.012343793517939674,0.00027684729846364837,0.012502080899567246,0.00027647899042774625,46546,,,,,,60600974230.5755,484807793844.604,0.6621609946522673,,F32,I32,16777216,67108864,67108864
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F32,F32,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,1196,0.00042307038377926434,0.004605601751545893,0.00041839050190105916,0.004593649085141324,0.00041637280448239853,1257,,,,,,40099418901.16681,481193026814.0017,0.6572238674797882,,F32,I64,16777216,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,No,1195,0.00042337769372384964,0.004694569298674352,0.00041869027609605687,0.004721621480904926,0.00041666047469429343,1265,,,,,,40070708487.50959,480848501850.1151,0.6567533078153889,,F32,F64,16777216,67108864,134217728
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I16,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I32,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,F32,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,I64,I64,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,1910,0.0002665688994764398,0.004225222015302186,0.0002618850015064806,0.004212590939733298,0.0002600365045472557,2011,,,,,,32031647294.594746,512506356713.5159,0.6999922922769831,,I64,F64,8388608,67108864,67108864
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,I8,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,I16,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,I32,,,
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,F32,,,
+copy_type_conversion_sweep,1,Quadro GP100,No,1912,0.00026628649320083707,0.004156825161653807,0.0002615861928238536,0.00409518011787359,0.0002597900572277251,2016,,,,,,32068236895.242805,513091790323.8849,0.7007918901932432,,F64,I64,8388608,67108864,67108864
+copy_type_conversion_sweep,1,Quadro GP100,Yes,,,,,,,,,,,,,,,,,F64,F64,,,
diff --git a/examples/outputs/nvbench.example.axes.json b/examples/outputs/nvbench.example.axes.json
new file mode 100644
index 0000000..b35a88e
--- /dev/null
+++ b/examples/outputs/nvbench.example.axes.json
@@ -0,0 +1,18797 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "simple",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010102518997995992"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005320863289715677"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010037636295826922"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005606955085401353"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072372945891786"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004201092756117083"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002567436508759"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003010855735431417"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001474511532383"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 1,
+      "name": "single_float64_axis",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "Duration": {
+          "type": "float64",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "0",
+              "description": "",
+              "value": 0.0
+            },
+            {
+              "input_string": "0.0001",
+              "description": "",
+              "value": 0.0001
+            },
+            {
+              "input_string": "0.0002",
+              "description": "",
+              "value": 0.0002
+            },
+            {
+              "input_string": "0.0003",
+              "description": "",
+              "value": 0.00030000000000000003
+            },
+            {
+              "input_string": "0.0004",
+              "description": "",
+              "value": 0.0004
+            },
+            {
+              "input_string": "0.0005",
+              "description": "",
+              "value": 0.0005
+            },
+            {
+              "input_string": "0.0006",
+              "description": "",
+              "value": 0.0006000000000000001
+            },
+            {
+              "input_string": "0.0007",
+              "description": "",
+              "value": 0.0007000000000000001
+            },
+            {
+              "input_string": "0.0008",
+              "description": "",
+              "value": 0.0008000000000000001
+            },
+            {
+              "input_string": "0.0009",
+              "description": "",
+              "value": 0.0009000000000000002
+            },
+            {
+              "input_string": "0.001",
+              "description": "",
+              "value": 0.0010000000000000002
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 Duration=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "147957"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.0618111072811744e-05"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.03254637275181478"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "4.42401244240246e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.1078451537144948"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "2.0427748176574993e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "244766"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "4831"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00011004767191057754"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004150807794866266"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010351461256001611"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004781428015913556"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010137620362095862"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5088"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0002": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2453"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00021036899388503913"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002247040450593631"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002039032309542721"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024717338672984117"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020172918129115027"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0003": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1648"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030986622087378617"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014834014227566264"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003034121238635499"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0016613620256970604"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030116395027406757"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1736"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0004": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1239"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004101481057304273"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012074050648415652"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000403672849583562"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00138227314754308"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040141034272550807"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1304"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0005": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "992"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005105290151209681"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009132925978805083"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005040890874881937"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001126663137928465"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017619947554283"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1042"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0006": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "829"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006098617478890229"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007603582722561039"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000603470634925204"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009715240544303389"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006011044563503441"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "872"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0007": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "711"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007102935302390999"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006584784515618755"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007037439938168366"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008111324463740949"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014426981064088"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0008": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "622"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000810564897106109"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006072346537084304"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008041868904587526"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000707547745748677"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008017951065694391"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "653"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0009": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "554"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009098726931407932"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005164841732610924"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000903432962588876"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005681374617078215"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009011252491744524"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.001": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010102697054108218"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00044759296847813034"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010038065348932821"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005354727313941588"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "152839"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "7.705229234684682e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05418085805591698"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.016308709558017e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.04127500754098809"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.3434882326935044e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "372166"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "4879"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010715639024390229"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004066079001791793"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0001024813676868744"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00308685243303526"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0001013762513461915"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5107"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0002": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2466"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020754400243309012"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0019319860083082698"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020283288218606642"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014786760562713035"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00020172824844867798"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2586"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0003": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00030000000000000003"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1655"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030687972205438067"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0013377397645028726"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030219129368978006"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010508732574554497"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030105650589762746"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1736"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0004": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0004"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1243"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004072136267095737"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001056923897828408"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004025078938310397"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007582628414963917"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004014090936302682"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1305"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0005": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "995"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005075617336683415"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007933683824162227"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005028640134849749"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006232695683416088"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017614337245814"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1045"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0006": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0006000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "831"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006069544103489773"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007196728740232229"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006022227483966297"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004991286373051354"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006010893901462271"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "873"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0007": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0007000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "712"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007072545856741567"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005896830799502328"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000702559235139509"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004401401059381315"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007014417189327791"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0008": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0008000000000000001"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "623"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008076356067415732"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005473369257973612"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008029095356384018"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040823678026317246"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008017940608599713"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "655"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0009": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0009000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "555"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009069345225225232"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004925253150638496"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009022477837296215"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003476692854287226"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000901123046875"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "582"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.001": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0010000000000000002"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072963186372732"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004478227973886651"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025936021116784"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003019453543048136"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473230260019"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 2,
+      "name": "copy_sweep_grid_shape",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "BlockSize": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        },
+        "NumBlocks": {
+          "type": "int64",
+          "flags": "pow2",
+          "values": [
+            {
+              "input_string": "6",
+              "description": "2^6 = 64",
+              "value": 64
+            },
+            {
+              "input_string": "8",
+              "description": "2^8 = 256",
+              "value": 256
+            },
+            {
+              "input_string": "10",
+              "description": "2^10 = 1024",
+              "value": 1024
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "66"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00762213684848485"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001421350078274123"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007615782766631154"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001435096779193961"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "8811814367.137686"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "70494514937.10149"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.08099094087442726"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007614612524060236"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "69"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "206"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024424633495145616"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004624145111239578"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024361206686612466"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004621497870662145"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "27547430167.685093"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "220379441341.48074"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.25319329198239976"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024339480377906972"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "215"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13161"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011122250784894763"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012946747378394284"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011059346540513448"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013016477670773642"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60680677428.96169"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "485445419431.69354"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5577268146044273"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011028085603954402"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13162"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "375"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024504410826666676"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005006459599172983"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0024441843128204348"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004997852033101851"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "27456548038.53994"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "219652384308.31952"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.25235797829540385"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002444396898467488"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "376"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13509"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010829132057147126"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00962532025494488"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010766412656412294"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009687962765161975"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62331684788.27634"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "498653478306.2107"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5729015145981281"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010755151240936572"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13510"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15105"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009647508075471688"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00509700309681285"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009584777770750065"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0051422186996564374"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "70016087597.56184"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "560128700780.4948"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6435302168893552"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009572492433860444"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15106"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13582"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010768811630834938"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007374891553423134"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010706156819352108"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007408403333483488"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62682496746.82156"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "501459973974.5725"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5761258892171099"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010709149855748453"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13583"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1782"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009628989545454543"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004952339328546463"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009565676676170035"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004999266556092292"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "70155898293.3024"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "561247186346.4192"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6448152416663824"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009545994625289475"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1783"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "14579"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010004705825502492"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.019254279361810064"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000994218452561916"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.019368874235788265"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "67499113325.72127"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "539992906605.77014"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6203962621849382"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009928190472480985"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "14580"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2236"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006688950130143119"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010949635482023172"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006684225965911029"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010951296261576433"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "10039885596.66435"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "80319084773.3148"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.10970154716634999"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006674802853478132"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2237"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "218"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002301079724770642"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0028902989580355323"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022963436360752907"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00289533405223059"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "29224225392.80601"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "233793803142.4481"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.31932064458922654"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022982710453501917"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "228"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^6": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "426"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011791361924882624"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003945300386512142"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011743737087003502"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003937393066379565"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "57144385558.72278"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "457155084469.7822"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6243923247238066"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011721584260596465"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "449"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "226"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00221838060176991"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015555610970668225"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002213621245021314"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00156938592170116"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "30316326314.149483"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "242530610513.19586"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.33125356549551443"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002213029949976925"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "237"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12933"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011352358293512678"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006664341285751961"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011305139959895455"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006686678312757843"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59361373886.62687"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "474890991093.01495"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6486164104745069"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011301243156632502"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12934"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "256"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "447"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001123642310961969"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021689290927995966"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001118954880392258"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0021631706036555243"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59974593413.87786"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "479796747311.0229"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6553167986656235"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001117002699110243"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "468"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^6 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "64"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011216752544642855"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003016436766525619"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011169237145887954"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030169455335936853"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60083659361.37964"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480669274891.0371"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6565085157493404"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011148893315741358"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "470"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^8 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "256"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "448"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011223883750000004"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002772433370845318"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001117700926693421"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002767310086041235"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60041879180.089096"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480335033440.71277"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6560520015306938"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001115604862286027"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "471"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 BlockSize=2^10 NumBlocks=2^10": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "BlockSize": {
+              "type": "int64",
+              "value": "1024"
+            },
+            "NumBlocks": {
+              "type": "int64",
+              "value": "1024"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "474"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001060387455696202"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015438063319203115"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010557063963845812"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015281670861921415"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63567734580.20524"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "508541876641.6419"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6945775194515432"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010540968806868097"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "498"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 3,
+      "name": "copy_type_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "T": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "U8",
+              "description": "uint8_t",
+              "is_active": true
+            },
+            {
+              "input_string": "U16",
+              "description": "uint16_t",
+              "is_active": true
+            },
+            {
+              "input_string": "U32",
+              "description": "uint32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "U64",
+              "description": "uint64_t",
+              "is_active": true
+            },
+            {
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
+            },
+            {
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 T=U8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "197"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0025498305939086305"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0029889374958217657"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002543548455698237"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030039924801443835"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "105535813716.71214"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "211071627433.4243"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.24249957195935695"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002539370934940079"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "206"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "314"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001601867525477707"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0041088290293520464"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001595620784789893"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004121783467130656"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "84116307132.25725"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "336465228529.029"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3865639114533881"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015915001814459024"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "331"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13509"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010828757243319257"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009631338263504865"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010766108759909336"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00968747141038187"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62333444233.72251"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "498667553869.7801"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5729176859717142"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010754745522399284"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13510"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=U64": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15542"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000936680335671086"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006014811602456382"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009304157742707227"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006054554605786883"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "36063911347.91389"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "577022581566.6222"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6629395468366523"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009291894323484043"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15543"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F32": {
+          "device": 0,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13508"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010830444422564424"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009669058622361396"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010767860333789413"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009724931080690958"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62323304648.94053"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "498586437191.52423"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5728244912586445"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010753850626975213"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "13509"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 T=F64": {
+          "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15546"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009366829067284162"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005915735845058088"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000930411673893452"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005957150780596753"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "36064070283.626465"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "577025124538.0234"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6629424684490159"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000929181897058261"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "15547"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5497"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0027078292635983312"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006395900728715414"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002703092247003721"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006391943780941085"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "99306805491.95126"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "198613610983.90253"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.2712707754915627"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0026997315984675927"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "5498"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "330"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001520048475757576"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004360351977281759"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015153354655612601"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004371610333527513"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "88572947080.2609"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "354291788321.0436"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.48389940494023653"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0015136887122844827"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "348"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12935"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001135307039969079"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006620691359921369"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011306220265028327"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006643085804704841"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59355701929.47401"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "474845615435.79205"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6485544354182038"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011301225775930074"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12936"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=U64": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "U64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "478"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010522014937238498"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002705184693469705"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010475129038718955"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0026904354098530054"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32032476044.899876"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512519616718.398"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7000104030791057"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010449056396484376"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "500"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F32": {
+          "device": 1,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12933"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011353411816283944"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0066894954953519185"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011306308270201252"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006710880496820158"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "59355239921.125435"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "474841919369.0035"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6485493872500594"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001130335244040081"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12934"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 T=F64": {
+          "device": 1,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "T": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "477"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010531248867924527"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0028403001389236257"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010484168726943076"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0028275833565767315"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32004856917.047768"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512077710672.7643"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6994068382221977"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001045540454641914"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "497"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 4,
+      "name": "copy_type_conversion_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "In": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I8",
+              "description": "int8_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I16",
+              "description": "int16_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            },
+            {
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
+            }
+          ]
+        },
+        "Out": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I8",
+              "description": "int8_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I16",
+              "description": "int16_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "F32",
+              "description": "float",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            },
+            {
+              "input_string": "F64",
+              "description": "double",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 In=I8 Out=I8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I8 Out=I16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "712"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007091774676966297"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.002907153358900259"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007029326261764157"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0029793074635008562"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "95469838076.85379"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "286409514230.56134"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.3290550485185677"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006997137222698028"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "748"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=I32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "622"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008109339212218649"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030780801739523534"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008046977720268298"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.003128636670235119"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "83396358648.99908"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "416981793244.9954"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.4790691558421363"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008020894928445145"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "654"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=F32": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "614"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008210284218241037"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0032226581332443155"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008147683657147577"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0032802944549392547"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "82365573853.77692"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "411827869268.8846"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.47314782774458247"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008120876105256783"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "645"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=I64": {
+          "device": 0,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12047"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001218285543537808"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010379223139149431"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012120340953301184"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010435925751575564"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "55368792229.992294"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "498319130069.93066"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5725173828928432"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012109437788209631"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12048"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=F64": {
+          "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12345"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011880294179019875"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007390662960171094"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0011817586330519084"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007441978357839151"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "56787284749.24732"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "511085562743.2259"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5871846998428606"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00118048292704533"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "12346"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I8": {
+          "device": 0,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I16 Out=I16": {
+          "device": 0,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I16 Out=I32": {
+          "device": 0,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30679"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045337666328758926"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005585454361874255"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004471040883478709"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005691409845285877"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "75048367649.66653"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "450290205897.9992"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5173370931732527"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00044639401537845537"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30680"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F32": {
+          "device": 0,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1111"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045644497029703035"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004630719301683377"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000450117155493756"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004699806594964763"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "74545996726.5466"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "447275980359.2796"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.513874058317187"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004474967917148505"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1162"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I64": {
+          "device": 0,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21586"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006636389210136234"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00678406319186359"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006573809432517179"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006858260008522721"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51042599187.65498"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "510425991876.5498"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5864269208140508"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006561174402032673"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21587"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=F64": {
+          "device": 0,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21638"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006617334348830768"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.006935544837856402"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006554566019343122"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0070075276540148375"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "51192454086.1711"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "511924540861.71094"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5881485993356054"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000653902132173752"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21639"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I8": {
+          "device": 0,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I16": {
+          "device": 0,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I32": {
+          "device": 0,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I32 Out=F32": {
+          "device": 0,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47462"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027253003156209153"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012292429580379986"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002662904619916644"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01259432062607704"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63003443212.04103"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "504027545696.32825"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5790757648165535"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026489117725947637"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47463"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I64": {
+          "device": 0,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35529"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003843450104703185"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007061698560844491"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003780983807571241"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007193170430510475"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44372620603.146774"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "532471447237.7613"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6117548796389721"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003773213782537234"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35530"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=F64": {
+          "device": 0,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35498"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003847829341653037"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007946267454909624"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003785711211350411"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008088117160474145"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44317210329.45711"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531806523953.4853"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6109909512333241"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037782656535605304"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35499"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=I8": {
+          "device": 0,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I16": {
+          "device": 0,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F32 Out=I32": {
+          "device": 0,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47607"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027168871905392146"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.01320649583700062"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002654402574849676"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.013542194342127112"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63205243089.21048"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "505641944713.68384"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.580930543099361"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026387154338796694"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47608"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F32": {
+          "device": 0,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=F32 Out=I64": {
+          "device": 0,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00038478567168089093"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007793965803915478"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00037854311241399495"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007936841129938173"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44320489396.863045"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "531845872762.35657"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6110361589641045"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003779212159950015"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35500"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F32 Out=F64": {
+          "device": 0,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35509"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00038455552378270424"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008192214819970133"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003782884006425947"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008364103785135068"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44350331576.38646"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "532203978916.6376"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6114475860715045"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003775098086048749"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "35510"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I8": {
+          "device": 0,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I16": {
+          "device": 0,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I32": {
+          "device": 0,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=F32": {
+          "device": 0,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I64": {
+          "device": 0,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=0 In=I64 Out=F64": {
+          "device": 0,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "52100"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00024245567445297397"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008539487651977316"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002361851441692315"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008809731550468568"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35517085672.37146"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "568273370757.9434"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6528876042715341"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00023509478066781645"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "52101"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=I8": {
+          "device": 0,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I16": {
+          "device": 0,
+          "type_config_index": 31,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I32": {
+          "device": 0,
+          "type_config_index": 32,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=F32": {
+          "device": 0,
+          "type_config_index": 33,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=F64 Out=I64": {
+          "device": 0,
+          "type_config_index": 34,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "51780"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002443157005407483"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009752081536074006"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00023802995230194252"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.010060889608084888"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "35241816917.89358"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "563869070686.2972"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6478275168730437"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00023685153101656335"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "51781"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=F64 Out=F64": {
+          "device": 0,
+          "type_config_index": 35,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I8 Out=I16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21576"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006664836124397485"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008032774520430731"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006617529590178744"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008026552131843242"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "101410750168.15656"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "304232250504.46967"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.415527002983596"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000660956536269952"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "21577"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "16864"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008629180932163245"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007882137751662148"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008582004840137586"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00791837780480227"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78197187312.3811"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "390985936561.9055"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.534017068075155"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008575679956081893"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "16865"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=F32": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "16866"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008626138966559889"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007971447505119814"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008578826743068147"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.008007931525887958"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "78226156104.88371"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "391130780524.4186"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5342148990991294"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008570867944797"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "16867"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I64": {
+          "device": 1,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "10113"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014600333443093004"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005526790210274132"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014553281934728316"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0055395074611650314"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "46112529325.676674"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "415012763931.09"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5668334297572799"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014537668022591766"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "10114"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=F64": {
+          "device": 1,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "536870912"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "10100"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014618894557425724"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005445897883226132"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001457197949886321"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005462563281467926"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "46053361525.27205"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "414480253727.4484"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5661061157772187"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0014559329674746807"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "10101"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=I8": {
+          "device": 1,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I16 Out=I16": {
+          "device": 1,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I16 Out=I32": {
+          "device": 1,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30414"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00046039200555665144"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00783107553466636"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045566900502105564"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00787891812842585"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "73637731841.00926"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "441826391046.0556"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6034560629453338"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004554145292263416"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30415"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=F32": {
+          "device": 1,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30506"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045893235589720245"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007769133803136496"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045424179060767235"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007825865470567534"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "73869099439.55573"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "443214596637.33435"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6053521042358697"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045380600078266583"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "30507"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=I64": {
+          "device": 1,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "19198"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007536445464110835"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0055724274957070705"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007489312136715944"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005596667585138299"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44803089238.998634"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "448030892389.9864"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6119303053840505"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007480235320061677"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "19199"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=F64": {
+          "device": 1,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "33554432"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "268435456"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "19239"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007522073903529291"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005390014038421145"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007475173373753664"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00541460189354359"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "44887831120.83274"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "448878311208.3274"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6130877283767584"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007464590773358217"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "19240"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I8": {
+          "device": 1,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I16": {
+          "device": 1,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I32": {
+          "device": 1,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I32 Out=F32": {
+          "device": 1,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47007"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027858482081392173"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007845561244706417"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027388008982136936"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.007927787169621173"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "61257523359.73918"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "490060186877.91345"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.669334826920227"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002735391274529489"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47008"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I64": {
+          "device": 1,
+          "type_config_index": 16,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1196"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042285716053511705"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0044259706705914136"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000418105284879638"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004434430384125105"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40126773343.29734"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481521280119.5681"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6576722029605115"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041619211102596686"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1254"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=F64": {
+          "device": 1,
+          "type_config_index": 17,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1195"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004233829054393305"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004675323526829325"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004187026210409825"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004672495644368017"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40069527050.69847"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480834324608.3817"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6567339442312906"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000416603259004343"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1252"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=I8": {
+          "device": 1,
+          "type_config_index": 18,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I16": {
+          "device": 1,
+          "type_config_index": 19,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F32 Out=I32": {
+          "device": 1,
+          "type_config_index": 20,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "46545"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000281580676807392"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012343793517939674"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027684729846364837"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.012502080899567246"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60600974230.5755"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "484807793844.604"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6621609946522673"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027647899042774625"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "46546"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F32": {
+          "device": 1,
+          "type_config_index": 21,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=F32 Out=I64": {
+          "device": 1,
+          "type_config_index": 22,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1196"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042307038377926434"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004605601751545893"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041839050190105916"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004593649085141324"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40099418901.16681"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "481193026814.0017"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6572238674797882"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041637280448239853"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1257"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F32 Out=F64": {
+          "device": 1,
+          "type_config_index": 23,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "134217728"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1195"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042337769372384964"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004694569298674352"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041869027609605687"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004721621480904926"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "40070708487.50959"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "480848501850.1151"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6567533078153889"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041666047469429343"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1265"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I8": {
+          "device": 1,
+          "type_config_index": 24,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I16": {
+          "device": 1,
+          "type_config_index": 25,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I32": {
+          "device": 1,
+          "type_config_index": 26,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=F32": {
+          "device": 1,
+          "type_config_index": 27,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I64": {
+          "device": 1,
+          "type_config_index": 28,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        },
+        "Device=1 In=I64 Out=F64": {
+          "device": 1,
+          "type_config_index": 29,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1910"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002665688994764398"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004225222015302186"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002618850015064806"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004212590939733298"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32031647294.594746"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "512506356713.5159"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6999922922769831"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002600365045472557"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2011"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F64 Out=I8": {
+          "device": 1,
+          "type_config_index": 30,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F64 Out=I16": {
+          "device": 1,
+          "type_config_index": 31,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F64 Out=I32": {
+          "device": 1,
+          "type_config_index": 32,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F64 Out=F32": {
+          "device": 1,
+          "type_config_index": 33,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Narrowing conversion: sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=F64 Out=I64": {
+          "device": 1,
+          "type_config_index": 34,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "8388608"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "InSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "OutSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1912"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026628649320083707"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004156825161653807"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002615861928238536"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00409518011787359"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "32068236895.242805"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "513091790323.8849"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7007918901932432"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002597900572277251"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2016"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=F64 Out=F64": {
+          "device": 1,
+          "type_config_index": 35,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "F64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "F64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Not a conversion: InputType == OutputType."
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.axes.list.md b/examples/outputs/nvbench.example.axes.list.md
new file mode 100644
index 0000000..7cb1d2c
--- /dev/null
+++ b/examples/outputs/nvbench.example.axes.list.md
@@ -0,0 +1,93 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 30117 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14891 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `simple` (1 configurations)
+
+## [1] `single_float64_axis` (11 configurations)
+
+### Axes
+
+* `Duration` : float64
+  * `0`
+  * `0.0001`
+  * `0.0002`
+  * `0.0003`
+  * `0.0004`
+  * `0.0005`
+  * `0.0006`
+  * `0.0007`
+  * `0.0008`
+  * `0.0009`
+  * `0.001`
+
+## [2] `copy_sweep_grid_shape` (9 configurations)
+
+### Axes
+
+* `BlockSize` : int64 [pow2]
+  * `6` (2^6 = 64)
+  * `8` (2^8 = 256)
+  * `10` (2^10 = 1024)
+* `NumBlocks` : int64 [pow2]
+  * `6` (2^6 = 64)
+  * `8` (2^8 = 256)
+  * `10` (2^10 = 1024)
+
+## [3] `copy_type_sweep` (6 configurations)
+
+### Axes
+
+* `T` : type
+  * `U8` (uint8_t)
+  * `U16` (uint16_t)
+  * `U32` (uint32_t)
+  * `U64` (uint64_t)
+  * `F32` (float)
+  * `F64` (double)
+
+## [4] `copy_type_conversion_sweep` (36 configurations)
+
+### Axes
+
+* `In` : type
+  * `I8` (int8_t)
+  * `I16` (int16_t)
+  * `I32` (int32_t)
+  * `F32` (float)
+  * `I64` (int64_t)
+  * `F64` (double)
+* `Out` : type
+  * `I8` (int8_t)
+  * `I16` (int16_t)
+  * `I32` (int32_t)
+  * `F32` (float)
+  * `I64` (int64_t)
+  * `F64` (double)
+
diff --git a/examples/outputs/nvbench.example.axes.md b/examples/outputs/nvbench.example.axes.md
new file mode 100644
index 0000000..3c0ec03
--- /dev/null
+++ b/examples/outputs/nvbench.example.axes.md
@@ -0,0 +1,563 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 29776 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14335 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  simple [Device=0]
+Pass: Cold: 1.003764ms GPU, 1.010252ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  simple [Device=1]
+Pass: Cold: 1.002567ms GPU, 1.007237ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  single_float64_axis [Device=0 Duration=0]
+Warn: Current measurement timed out (15.00s) while over noise threshold (10.78% > 0.50%)
+Pass: Cold: 0.004424ms GPU, 0.010618ms CPU, 0.65s total GPU, 147957x
+Pass: Batch: 0.002043ms GPU, 0.50s total GPU, 244766x
+Run:  single_float64_axis [Device=0 Duration=0.0001]
+Pass: Cold: 0.103515ms GPU, 0.110048ms CPU, 0.50s total GPU, 4831x
+Pass: Batch: 0.101376ms GPU, 0.52s total GPU, 5088x
+Run:  single_float64_axis [Device=0 Duration=0.0002]
+Pass: Cold: 0.203903ms GPU, 0.210369ms CPU, 0.50s total GPU, 2453x
+Pass: Batch: 0.201729ms GPU, 0.52s total GPU, 2582x
+Run:  single_float64_axis [Device=0 Duration=0.0003]
+Pass: Cold: 0.303412ms GPU, 0.309866ms CPU, 0.50s total GPU, 1648x
+Pass: Batch: 0.301164ms GPU, 0.52s total GPU, 1736x
+Run:  single_float64_axis [Device=0 Duration=0.0004]
+Pass: Cold: 0.403673ms GPU, 0.410148ms CPU, 0.50s total GPU, 1239x
+Pass: Batch: 0.401410ms GPU, 0.52s total GPU, 1304x
+Run:  single_float64_axis [Device=0 Duration=0.0005]
+Pass: Cold: 0.504089ms GPU, 0.510529ms CPU, 0.50s total GPU, 992x
+Pass: Batch: 0.501762ms GPU, 0.52s total GPU, 1042x
+Run:  single_float64_axis [Device=0 Duration=0.0006]
+Pass: Cold: 0.603471ms GPU, 0.609862ms CPU, 0.50s total GPU, 829x
+Pass: Batch: 0.601104ms GPU, 0.52s total GPU, 872x
+Run:  single_float64_axis [Device=0 Duration=0.0007]
+Pass: Cold: 0.703744ms GPU, 0.710294ms CPU, 0.50s total GPU, 711x
+Pass: Batch: 0.701443ms GPU, 0.52s total GPU, 748x
+Run:  single_float64_axis [Device=0 Duration=0.0008]
+Pass: Cold: 0.804187ms GPU, 0.810565ms CPU, 0.50s total GPU, 622x
+Pass: Batch: 0.801795ms GPU, 0.52s total GPU, 653x
+Run:  single_float64_axis [Device=0 Duration=0.0009]
+Pass: Cold: 0.903433ms GPU, 0.909873ms CPU, 0.50s total GPU, 554x
+Pass: Batch: 0.901125ms GPU, 0.52s total GPU, 582x
+Run:  single_float64_axis [Device=0 Duration=0.001]
+Pass: Cold: 1.003807ms GPU, 1.010270ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  single_float64_axis [Device=1 Duration=0]
+Warn: Current measurement timed out (15.00s) while over noise threshold (4.13% > 0.50%)
+Warn: Current measurement timed out (15.00s) before accumulating min_time (0.46s < 0.50s)
+Pass: Cold: 0.003016ms GPU, 0.007705ms CPU, 0.46s total GPU, 152839x
+Pass: Batch: 0.001343ms GPU, 0.50s total GPU, 372166x
+Run:  single_float64_axis [Device=1 Duration=0.0001]
+Pass: Cold: 0.102481ms GPU, 0.107156ms CPU, 0.50s total GPU, 4879x
+Pass: Batch: 0.101376ms GPU, 0.52s total GPU, 5107x
+Run:  single_float64_axis [Device=1 Duration=0.0002]
+Pass: Cold: 0.202833ms GPU, 0.207544ms CPU, 0.50s total GPU, 2466x
+Pass: Batch: 0.201728ms GPU, 0.52s total GPU, 2586x
+Run:  single_float64_axis [Device=1 Duration=0.0003]
+Pass: Cold: 0.302191ms GPU, 0.306880ms CPU, 0.50s total GPU, 1655x
+Pass: Batch: 0.301057ms GPU, 0.52s total GPU, 1736x
+Run:  single_float64_axis [Device=1 Duration=0.0004]
+Pass: Cold: 0.402508ms GPU, 0.407214ms CPU, 0.50s total GPU, 1243x
+Pass: Batch: 0.401409ms GPU, 0.52s total GPU, 1305x
+Run:  single_float64_axis [Device=1 Duration=0.0005]
+Pass: Cold: 0.502864ms GPU, 0.507562ms CPU, 0.50s total GPU, 995x
+Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1045x
+Run:  single_float64_axis [Device=1 Duration=0.0006]
+Pass: Cold: 0.602223ms GPU, 0.606954ms CPU, 0.50s total GPU, 831x
+Pass: Batch: 0.601089ms GPU, 0.52s total GPU, 873x
+Run:  single_float64_axis [Device=1 Duration=0.0007]
+Pass: Cold: 0.702559ms GPU, 0.707255ms CPU, 0.50s total GPU, 712x
+Pass: Batch: 0.701442ms GPU, 0.52s total GPU, 748x
+Run:  single_float64_axis [Device=1 Duration=0.0008]
+Pass: Cold: 0.802910ms GPU, 0.807636ms CPU, 0.50s total GPU, 623x
+Pass: Batch: 0.801794ms GPU, 0.53s total GPU, 655x
+Run:  single_float64_axis [Device=1 Duration=0.0009]
+Pass: Cold: 0.902248ms GPU, 0.906935ms CPU, 0.50s total GPU, 555x
+Pass: Batch: 0.901123ms GPU, 0.52s total GPU, 582x
+Run:  single_float64_axis [Device=1 Duration=0.001]
+Pass: Cold: 1.002594ms GPU, 1.007296ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^6]
+Pass: Cold: 7.615783ms GPU, 7.622137ms CPU, 0.50s total GPU, 66x
+Pass: Batch: 7.614613ms GPU, 0.53s total GPU, 69x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^6]
+Pass: Cold: 2.436121ms GPU, 2.442463ms CPU, 0.50s total GPU, 206x
+Pass: Batch: 2.433948ms GPU, 0.52s total GPU, 215x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^6]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.30% > 0.50%)
+Pass: Cold: 1.105935ms GPU, 1.112225ms CPU, 14.56s total GPU, 13161x
+Pass: Batch: 1.102809ms GPU, 14.52s total GPU, 13162x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^8]
+Pass: Cold: 2.444184ms GPU, 2.450441ms CPU, 0.92s total GPU, 375x
+Pass: Batch: 2.444397ms GPU, 0.92s total GPU, 376x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^8]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%)
+Pass: Cold: 1.076641ms GPU, 1.082913ms CPU, 14.54s total GPU, 13509x
+Pass: Batch: 1.075515ms GPU, 14.53s total GPU, 13510x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^8]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.51% > 0.50%)
+Pass: Cold: 0.958478ms GPU, 0.964751ms CPU, 14.48s total GPU, 15105x
+Pass: Batch: 0.957249ms GPU, 14.46s total GPU, 15106x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^6 NumBlocks=2^10]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.74% > 0.50%)
+Pass: Cold: 1.070616ms GPU, 1.076881ms CPU, 14.54s total GPU, 13582x
+Pass: Batch: 1.070915ms GPU, 14.55s total GPU, 13583x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^8 NumBlocks=2^10]
+Pass: Cold: 0.956568ms GPU, 0.962899ms CPU, 1.70s total GPU, 1782x
+Pass: Batch: 0.954599ms GPU, 1.70s total GPU, 1783x
+Run:  copy_sweep_grid_shape [Device=0 BlockSize=2^10 NumBlocks=2^10]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.94% > 0.50%)
+Pass: Cold: 0.994218ms GPU, 1.000471ms CPU, 14.49s total GPU, 14579x
+Pass: Batch: 0.992819ms GPU, 14.48s total GPU, 14580x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^6]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.10% > 0.50%)
+Pass: Cold: 6.684226ms GPU, 6.688950ms CPU, 14.95s total GPU, 2236x
+Pass: Batch: 6.674803ms GPU, 14.93s total GPU, 2237x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^6]
+Pass: Cold: 2.296344ms GPU, 2.301080ms CPU, 0.50s total GPU, 218x
+Pass: Batch: 2.298271ms GPU, 0.52s total GPU, 228x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^6]
+Pass: Cold: 1.174374ms GPU, 1.179136ms CPU, 0.50s total GPU, 426x
+Pass: Batch: 1.172158ms GPU, 0.53s total GPU, 449x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^8]
+Pass: Cold: 2.213621ms GPU, 2.218381ms CPU, 0.50s total GPU, 226x
+Pass: Batch: 2.213030ms GPU, 0.52s total GPU, 237x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^8]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.67% > 0.50%)
+Pass: Cold: 1.130514ms GPU, 1.135236ms CPU, 14.62s total GPU, 12933x
+Pass: Batch: 1.130124ms GPU, 14.62s total GPU, 12934x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^8]
+Pass: Cold: 1.118955ms GPU, 1.123642ms CPU, 0.50s total GPU, 447x
+Pass: Batch: 1.117003ms GPU, 0.52s total GPU, 468x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^6 NumBlocks=2^10]
+Pass: Cold: 1.116924ms GPU, 1.121675ms CPU, 0.50s total GPU, 448x
+Pass: Batch: 1.114889ms GPU, 0.52s total GPU, 470x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^8 NumBlocks=2^10]
+Pass: Cold: 1.117701ms GPU, 1.122388ms CPU, 0.50s total GPU, 448x
+Pass: Batch: 1.115605ms GPU, 0.53s total GPU, 471x
+Run:  copy_sweep_grid_shape [Device=1 BlockSize=2^10 NumBlocks=2^10]
+Pass: Cold: 1.055706ms GPU, 1.060387ms CPU, 0.50s total GPU, 474x
+Pass: Batch: 1.054097ms GPU, 0.52s total GPU, 498x
+Run:  copy_type_sweep [Device=0 T=U8]
+Pass: Cold: 2.543548ms GPU, 2.549831ms CPU, 0.50s total GPU, 197x
+Pass: Batch: 2.539371ms GPU, 0.52s total GPU, 206x
+Run:  copy_type_sweep [Device=0 T=U16]
+Pass: Cold: 1.595621ms GPU, 1.601868ms CPU, 0.50s total GPU, 314x
+Pass: Batch: 1.591500ms GPU, 0.53s total GPU, 331x
+Run:  copy_type_sweep [Device=0 T=U32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%)
+Pass: Cold: 1.076611ms GPU, 1.082876ms CPU, 14.54s total GPU, 13509x
+Pass: Batch: 1.075475ms GPU, 14.53s total GPU, 13510x
+Run:  copy_type_sweep [Device=0 T=U64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.61% > 0.50%)
+Pass: Cold: 0.930416ms GPU, 0.936680ms CPU, 14.46s total GPU, 15542x
+Pass: Batch: 0.929189ms GPU, 14.44s total GPU, 15543x
+Run:  copy_type_sweep [Device=0 T=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.97% > 0.50%)
+Pass: Cold: 1.076786ms GPU, 1.083044ms CPU, 14.55s total GPU, 13508x
+Pass: Batch: 1.075385ms GPU, 14.53s total GPU, 13509x
+Run:  copy_type_sweep [Device=0 T=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.60% > 0.50%)
+Pass: Cold: 0.930412ms GPU, 0.936683ms CPU, 14.46s total GPU, 15546x
+Pass: Batch: 0.929182ms GPU, 14.45s total GPU, 15547x
+Run:  copy_type_sweep [Device=1 T=U8]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.64% > 0.50%)
+Pass: Cold: 2.703092ms GPU, 2.707829ms CPU, 14.86s total GPU, 5497x
+Pass: Batch: 2.699732ms GPU, 14.84s total GPU, 5498x
+Run:  copy_type_sweep [Device=1 T=U16]
+Pass: Cold: 1.515335ms GPU, 1.520048ms CPU, 0.50s total GPU, 330x
+Pass: Batch: 1.513689ms GPU, 0.53s total GPU, 348x
+Run:  copy_type_sweep [Device=1 T=U32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.66% > 0.50%)
+Pass: Cold: 1.130622ms GPU, 1.135307ms CPU, 14.62s total GPU, 12935x
+Pass: Batch: 1.130123ms GPU, 14.62s total GPU, 12936x
+Run:  copy_type_sweep [Device=1 T=U64]
+Pass: Cold: 1.047513ms GPU, 1.052201ms CPU, 0.50s total GPU, 478x
+Pass: Batch: 1.044906ms GPU, 0.52s total GPU, 500x
+Run:  copy_type_sweep [Device=1 T=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.67% > 0.50%)
+Pass: Cold: 1.130631ms GPU, 1.135341ms CPU, 14.62s total GPU, 12933x
+Pass: Batch: 1.130335ms GPU, 14.62s total GPU, 12934x
+Run:  copy_type_sweep [Device=1 T=F64]
+Pass: Cold: 1.048417ms GPU, 1.053125ms CPU, 0.50s total GPU, 477x
+Pass: Batch: 1.045540ms GPU, 0.52s total GPU, 497x
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=I8]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=I16]
+Pass: Cold: 0.702933ms GPU, 0.709177ms CPU, 0.50s total GPU, 712x
+Pass: Batch: 0.699714ms GPU, 0.52s total GPU, 748x
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=I32]
+Pass: Cold: 0.804698ms GPU, 0.810934ms CPU, 0.50s total GPU, 622x
+Pass: Batch: 0.802089ms GPU, 0.52s total GPU, 654x
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=F32]
+Pass: Cold: 0.814768ms GPU, 0.821028ms CPU, 0.50s total GPU, 614x
+Pass: Batch: 0.812088ms GPU, 0.52s total GPU, 645x
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.04% > 0.50%)
+Pass: Cold: 1.212034ms GPU, 1.218286ms CPU, 14.60s total GPU, 12047x
+Pass: Batch: 1.210944ms GPU, 14.59s total GPU, 12048x
+Run:  copy_type_conversion_sweep [Device=0 In=I8 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.74% > 0.50%)
+Pass: Cold: 1.181759ms GPU, 1.188029ms CPU, 14.59s total GPU, 12345x
+Pass: Batch: 1.180483ms GPU, 14.57s total GPU, 12346x
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=I16]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=I32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.57% > 0.50%)
+Pass: Cold: 0.447104ms GPU, 0.453377ms CPU, 13.72s total GPU, 30679x
+Pass: Batch: 0.446394ms GPU, 13.70s total GPU, 30680x
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=F32]
+Pass: Cold: 0.450117ms GPU, 0.456445ms CPU, 0.50s total GPU, 1111x
+Pass: Batch: 0.447497ms GPU, 0.52s total GPU, 1162x
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.69% > 0.50%)
+Pass: Cold: 0.657381ms GPU, 0.663639ms CPU, 14.19s total GPU, 21586x
+Pass: Batch: 0.656117ms GPU, 14.16s total GPU, 21587x
+Run:  copy_type_conversion_sweep [Device=0 In=I16 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.70% > 0.50%)
+Pass: Cold: 0.655457ms GPU, 0.661733ms CPU, 14.18s total GPU, 21638x
+Pass: Batch: 0.653902ms GPU, 14.15s total GPU, 21639x
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=I32]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.26% > 0.50%)
+Pass: Cold: 0.266290ms GPU, 0.272530ms CPU, 12.64s total GPU, 47462x
+Pass: Batch: 0.264891ms GPU, 12.57s total GPU, 47463x
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.72% > 0.50%)
+Pass: Cold: 0.378098ms GPU, 0.384345ms CPU, 13.43s total GPU, 35529x
+Pass: Batch: 0.377321ms GPU, 13.41s total GPU, 35530x
+Run:  copy_type_conversion_sweep [Device=0 In=I32 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.81% > 0.50%)
+Pass: Cold: 0.378571ms GPU, 0.384783ms CPU, 13.44s total GPU, 35498x
+Pass: Batch: 0.377827ms GPU, 13.41s total GPU, 35499x
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=I32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.35% > 0.50%)
+Pass: Cold: 0.265440ms GPU, 0.271689ms CPU, 12.64s total GPU, 47607x
+Pass: Batch: 0.263872ms GPU, 12.56s total GPU, 47608x
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=F32]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%)
+Pass: Cold: 0.378543ms GPU, 0.384786ms CPU, 13.44s total GPU, 35499x
+Pass: Batch: 0.377921ms GPU, 13.42s total GPU, 35500x
+Run:  copy_type_conversion_sweep [Device=0 In=F32 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.84% > 0.50%)
+Pass: Cold: 0.378288ms GPU, 0.384556ms CPU, 13.43s total GPU, 35509x
+Pass: Batch: 0.377510ms GPU, 13.41s total GPU, 35510x
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=I32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=F32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=I64]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=0 In=I64 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.88% > 0.50%)
+Pass: Cold: 0.236185ms GPU, 0.242456ms CPU, 12.31s total GPU, 52100x
+Pass: Batch: 0.235095ms GPU, 12.25s total GPU, 52101x
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=I32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=F32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.01% > 0.50%)
+Pass: Cold: 0.238030ms GPU, 0.244316ms CPU, 12.33s total GPU, 51780x
+Pass: Batch: 0.236852ms GPU, 12.26s total GPU, 51781x
+Run:  copy_type_conversion_sweep [Device=0 In=F64 Out=F64]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=I8]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=I16]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.80% > 0.50%)
+Pass: Cold: 0.661753ms GPU, 0.666484ms CPU, 14.28s total GPU, 21576x
+Pass: Batch: 0.660957ms GPU, 14.26s total GPU, 21577x
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=I32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%)
+Pass: Cold: 0.858200ms GPU, 0.862918ms CPU, 14.47s total GPU, 16864x
+Pass: Batch: 0.857568ms GPU, 14.46s total GPU, 16865x
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.80% > 0.50%)
+Pass: Cold: 0.857883ms GPU, 0.862614ms CPU, 14.47s total GPU, 16866x
+Pass: Batch: 0.857087ms GPU, 14.46s total GPU, 16867x
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.55% > 0.50%)
+Pass: Cold: 1.455328ms GPU, 1.460033ms CPU, 14.72s total GPU, 10113x
+Pass: Batch: 1.453767ms GPU, 14.70s total GPU, 10114x
+Run:  copy_type_conversion_sweep [Device=1 In=I8 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.55% > 0.50%)
+Pass: Cold: 1.457198ms GPU, 1.461889ms CPU, 14.72s total GPU, 10100x
+Pass: Batch: 1.455933ms GPU, 14.71s total GPU, 10101x
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=I16]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=I32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%)
+Pass: Cold: 0.455669ms GPU, 0.460392ms CPU, 13.86s total GPU, 30414x
+Pass: Batch: 0.455415ms GPU, 13.85s total GPU, 30415x
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.78% > 0.50%)
+Pass: Cold: 0.454242ms GPU, 0.458932ms CPU, 13.86s total GPU, 30506x
+Pass: Batch: 0.453806ms GPU, 13.84s total GPU, 30507x
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=I64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.56% > 0.50%)
+Pass: Cold: 0.748931ms GPU, 0.753645ms CPU, 14.38s total GPU, 19198x
+Pass: Batch: 0.748024ms GPU, 14.36s total GPU, 19199x
+Run:  copy_type_conversion_sweep [Device=1 In=I16 Out=F64]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.54% > 0.50%)
+Pass: Cold: 0.747517ms GPU, 0.752207ms CPU, 14.38s total GPU, 19239x
+Pass: Batch: 0.746459ms GPU, 14.36s total GPU, 19240x
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=I32]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=F32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.79% > 0.50%)
+Pass: Cold: 0.273880ms GPU, 0.278585ms CPU, 12.87s total GPU, 47007x
+Pass: Batch: 0.273539ms GPU, 12.86s total GPU, 47008x
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=I64]
+Pass: Cold: 0.418105ms GPU, 0.422857ms CPU, 0.50s total GPU, 1196x
+Pass: Batch: 0.416192ms GPU, 0.52s total GPU, 1254x
+Run:  copy_type_conversion_sweep [Device=1 In=I32 Out=F64]
+Pass: Cold: 0.418703ms GPU, 0.423383ms CPU, 0.50s total GPU, 1195x
+Pass: Batch: 0.416603ms GPU, 0.52s total GPU, 1252x
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=I32]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.25% > 0.50%)
+Pass: Cold: 0.276847ms GPU, 0.281581ms CPU, 12.89s total GPU, 46545x
+Pass: Batch: 0.276479ms GPU, 12.87s total GPU, 46546x
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=F32]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=I64]
+Pass: Cold: 0.418391ms GPU, 0.423070ms CPU, 0.50s total GPU, 1196x
+Pass: Batch: 0.416373ms GPU, 0.52s total GPU, 1257x
+Run:  copy_type_conversion_sweep [Device=1 In=F32 Out=F64]
+Pass: Cold: 0.418690ms GPU, 0.423378ms CPU, 0.50s total GPU, 1195x
+Pass: Batch: 0.416660ms GPU, 0.53s total GPU, 1265x
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=I32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=F32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=I64]
+Skip: Not a conversion: InputType == OutputType.
+Run:  copy_type_conversion_sweep [Device=1 In=I64 Out=F64]
+Pass: Cold: 0.261885ms GPU, 0.266569ms CPU, 0.50s total GPU, 1910x
+Pass: Batch: 0.260037ms GPU, 0.52s total GPU, 2011x
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=I8]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=I16]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=I32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=F32]
+Skip: Narrowing conversion: sizeof(InputType) > sizeof(OutputType).
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=I64]
+Pass: Cold: 0.261586ms GPU, 0.266286ms CPU, 0.50s total GPU, 1912x
+Pass: Batch: 0.259790ms GPU, 0.52s total GPU, 2016x
+Run:  copy_type_conversion_sweep [Device=1 In=F64 Out=F64]
+Skip: Not a conversion: InputType == OutputType.
+```
+
+# Benchmark Results
+
+## simple
+
+### [0] Quadro GV100
+
+| Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|---------|----------|-------|----------|-------|-----------|-------|
+|    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  524x |
+
+### [1] Quadro GP100
+
+| Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|---------|----------|-------|----------|-------|-----------|-------|
+|    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+
+## single_float64_axis
+
+### [0] Quadro GV100
+
+| Duration | Samples |  CPU Time  | Noise |  GPU Time  | Noise  | Batch GPU  |  Batch  |
+|----------|---------|------------|-------|------------|--------|------------|---------|
+|        0 | 147957x |  10.618 us | 3.25% |   4.424 us | 10.78% |   2.043 us | 244766x |
+|   0.0001 |   4831x | 110.048 us | 0.42% | 103.515 us |  0.48% | 101.376 us |   5088x |
+|   0.0002 |   2453x | 210.369 us | 0.22% | 203.903 us |  0.25% | 201.729 us |   2582x |
+|   0.0003 |   1648x | 309.866 us | 0.15% | 303.412 us |  0.17% | 301.164 us |   1736x |
+|   0.0004 |   1239x | 410.148 us | 0.12% | 403.673 us |  0.14% | 401.410 us |   1304x |
+|   0.0005 |    992x | 510.529 us | 0.09% | 504.089 us |  0.11% | 501.762 us |   1042x |
+|   0.0006 |    829x | 609.862 us | 0.08% | 603.471 us |  0.10% | 601.104 us |    872x |
+|   0.0007 |    711x | 710.294 us | 0.07% | 703.744 us |  0.08% | 701.443 us |    748x |
+|   0.0008 |    622x | 810.565 us | 0.06% | 804.187 us |  0.07% | 801.795 us |    653x |
+|   0.0009 |    554x | 909.873 us | 0.05% | 903.433 us |  0.06% | 901.125 us |    582x |
+|    0.001 |    499x |   1.010 ms | 0.04% |   1.004 ms |  0.05% |   1.001 ms |    523x |
+
+### [1] Quadro GP100
+
+| Duration | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Batch GPU  |  Batch  |
+|----------|---------|------------|-------|------------|-------|------------|---------|
+|        0 | 152839x |   7.705 us | 5.42% |   3.016 us | 4.13% |   1.343 us | 372166x |
+|   0.0001 |   4879x | 107.156 us | 0.41% | 102.481 us | 0.31% | 101.376 us |   5107x |
+|   0.0002 |   2466x | 207.544 us | 0.19% | 202.833 us | 0.15% | 201.728 us |   2586x |
+|   0.0003 |   1655x | 306.880 us | 0.13% | 302.191 us | 0.11% | 301.057 us |   1736x |
+|   0.0004 |   1243x | 407.214 us | 0.11% | 402.508 us | 0.08% | 401.409 us |   1305x |
+|   0.0005 |    995x | 507.562 us | 0.08% | 502.864 us | 0.06% | 501.761 us |   1045x |
+|   0.0006 |    831x | 606.954 us | 0.07% | 602.223 us | 0.05% | 601.089 us |    873x |
+|   0.0007 |    712x | 707.255 us | 0.06% | 702.559 us | 0.04% | 701.442 us |    748x |
+|   0.0008 |    623x | 807.636 us | 0.05% | 802.910 us | 0.04% | 801.794 us |    655x |
+|   0.0009 |    555x | 906.935 us | 0.05% | 902.248 us | 0.03% | 901.123 us |    582x |
+|    0.001 |    499x |   1.007 ms | 0.04% |   1.003 ms | 0.03% |   1.001 ms |    524x |
+
+## copy_sweep_grid_shape
+
+### [0] Quadro GV100
+
+| BlockSize | (BlockSize) | NumBlocks | (NumBlocks) | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-----------|-------------|-----------|-------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------|
+|       2^6 |          64 |       2^6 |          64 |     66x |   7.622 ms | 0.14% |   7.616 ms | 0.14% |  8.812G |  70.495 GB/s |  8.10% |   7.615 ms |    69x |
+|       2^8 |         256 |       2^6 |          64 |    206x |   2.442 ms | 0.46% |   2.436 ms | 0.46% | 27.547G | 220.379 GB/s | 25.32% |   2.434 ms |   215x |
+|      2^10 |        1024 |       2^6 |          64 |  13161x |   1.112 ms | 1.29% |   1.106 ms | 1.30% | 60.681G | 485.445 GB/s | 55.77% |   1.103 ms | 13162x |
+|       2^6 |          64 |       2^8 |         256 |    375x |   2.450 ms | 0.50% |   2.444 ms | 0.50% | 27.457G | 219.652 GB/s | 25.24% |   2.444 ms |   376x |
+|       2^8 |         256 |       2^8 |         256 |  13509x |   1.083 ms | 0.96% |   1.077 ms | 0.97% | 62.332G | 498.653 GB/s | 57.29% |   1.076 ms | 13510x |
+|      2^10 |        1024 |       2^8 |         256 |  15105x | 964.751 us | 0.51% | 958.478 us | 0.51% | 70.016G | 560.129 GB/s | 64.35% | 957.249 us | 15106x |
+|       2^6 |          64 |      2^10 |        1024 |  13582x |   1.077 ms | 0.74% |   1.071 ms | 0.74% | 62.682G | 501.460 GB/s | 57.61% |   1.071 ms | 13583x |
+|       2^8 |         256 |      2^10 |        1024 |   1782x | 962.899 us | 0.50% | 956.568 us | 0.50% | 70.156G | 561.247 GB/s | 64.48% | 954.599 us |  1783x |
+|      2^10 |        1024 |      2^10 |        1024 |  14579x |   1.000 ms | 1.93% | 994.218 us | 1.94% | 67.499G | 539.993 GB/s | 62.04% | 992.819 us | 14580x |
+
+### [1] Quadro GP100
+
+| BlockSize | (BlockSize) | NumBlocks | (NumBlocks) | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU | Batch  |
+|-----------|-------------|-----------|-------------|---------|----------|-------|----------|-------|---------|--------------|--------|-----------|--------|
+|       2^6 |          64 |       2^6 |          64 |   2236x | 6.689 ms | 1.09% | 6.684 ms | 1.10% | 10.040G |  80.319 GB/s | 10.97% |  6.675 ms |  2237x |
+|       2^8 |         256 |       2^6 |          64 |    218x | 2.301 ms | 0.29% | 2.296 ms | 0.29% | 29.224G | 233.794 GB/s | 31.93% |  2.298 ms |   228x |
+|      2^10 |        1024 |       2^6 |          64 |    426x | 1.179 ms | 0.39% | 1.174 ms | 0.39% | 57.144G | 457.155 GB/s | 62.44% |  1.172 ms |   449x |
+|       2^6 |          64 |       2^8 |         256 |    226x | 2.218 ms | 0.16% | 2.214 ms | 0.16% | 30.316G | 242.531 GB/s | 33.13% |  2.213 ms |   237x |
+|       2^8 |         256 |       2^8 |         256 |  12933x | 1.135 ms | 0.67% | 1.131 ms | 0.67% | 59.361G | 474.891 GB/s | 64.86% |  1.130 ms | 12934x |
+|      2^10 |        1024 |       2^8 |         256 |    447x | 1.124 ms | 0.22% | 1.119 ms | 0.22% | 59.975G | 479.797 GB/s | 65.53% |  1.117 ms |   468x |
+|       2^6 |          64 |      2^10 |        1024 |    448x | 1.122 ms | 0.30% | 1.117 ms | 0.30% | 60.084G | 480.669 GB/s | 65.65% |  1.115 ms |   470x |
+|       2^8 |         256 |      2^10 |        1024 |    448x | 1.122 ms | 0.28% | 1.118 ms | 0.28% | 60.042G | 480.335 GB/s | 65.61% |  1.116 ms |   471x |
+|      2^10 |        1024 |      2^10 |        1024 |    474x | 1.060 ms | 0.15% | 1.056 ms | 0.15% | 63.568G | 508.542 GB/s | 69.46% |  1.054 ms |   498x |
+
+## copy_type_sweep
+
+### [0] Quadro GV100
+
+|  T  | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-----|---------|------------|-------|------------|-------|----------|--------------|--------|------------|--------|
+|  U8 |    197x |   2.550 ms | 0.30% |   2.544 ms | 0.30% | 105.536G | 211.072 GB/s | 24.25% |   2.539 ms |   206x |
+| U16 |    314x |   1.602 ms | 0.41% |   1.596 ms | 0.41% |  84.116G | 336.465 GB/s | 38.66% |   1.592 ms |   331x |
+| U32 |  13509x |   1.083 ms | 0.96% |   1.077 ms | 0.97% |  62.333G | 498.668 GB/s | 57.29% |   1.075 ms | 13510x |
+| U64 |  15542x | 936.680 us | 0.60% | 930.416 us | 0.61% |  36.064G | 577.023 GB/s | 66.29% | 929.189 us | 15543x |
+| F32 |  13508x |   1.083 ms | 0.97% |   1.077 ms | 0.97% |  62.323G | 498.586 GB/s | 57.28% |   1.075 ms | 13509x |
+| F64 |  15546x | 936.683 us | 0.59% | 930.412 us | 0.60% |  36.064G | 577.025 GB/s | 66.29% | 929.182 us | 15547x |
+
+### [1] Quadro GP100
+
+|  T  | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU | Batch  |
+|-----|---------|----------|-------|----------|-------|---------|--------------|--------|-----------|--------|
+|  U8 |   5497x | 2.708 ms | 0.64% | 2.703 ms | 0.64% | 99.307G | 198.614 GB/s | 27.13% |  2.700 ms |  5498x |
+| U16 |    330x | 1.520 ms | 0.44% | 1.515 ms | 0.44% | 88.573G | 354.292 GB/s | 48.39% |  1.514 ms |   348x |
+| U32 |  12935x | 1.135 ms | 0.66% | 1.131 ms | 0.66% | 59.356G | 474.846 GB/s | 64.86% |  1.130 ms | 12936x |
+| U64 |    478x | 1.052 ms | 0.27% | 1.048 ms | 0.27% | 32.032G | 512.520 GB/s | 70.00% |  1.045 ms |   500x |
+| F32 |  12933x | 1.135 ms | 0.67% | 1.131 ms | 0.67% | 59.355G | 474.842 GB/s | 64.85% |  1.130 ms | 12934x |
+| F64 |    477x | 1.053 ms | 0.28% | 1.048 ms | 0.28% | 32.005G | 512.078 GB/s | 69.94% |  1.046 ms |   497x |
+
+## copy_type_conversion_sweep
+
+### [0] Quadro GV100
+
+| In  | Out |  Items   |   InSize   |   OutSize   | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-----|-----|----------|------------|-------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------|
+|  I8 | I16 | 67108864 | 64.000 MiB | 128.000 MiB |    712x | 709.177 us | 0.29% | 702.933 us | 0.30% | 95.470G | 286.410 GB/s | 32.91% | 699.714 us |   748x |
+|  I8 | I32 | 67108864 | 64.000 MiB | 256.000 MiB |    622x | 810.934 us | 0.31% | 804.698 us | 0.31% | 83.396G | 416.982 GB/s | 47.91% | 802.089 us |   654x |
+|  I8 | F32 | 67108864 | 64.000 MiB | 256.000 MiB |    614x | 821.028 us | 0.32% | 814.768 us | 0.33% | 82.366G | 411.828 GB/s | 47.31% | 812.088 us |   645x |
+|  I8 | I64 | 67108864 | 64.000 MiB | 512.000 MiB |  12047x |   1.218 ms | 1.04% |   1.212 ms | 1.04% | 55.369G | 498.319 GB/s | 57.25% |   1.211 ms | 12048x |
+|  I8 | F64 | 67108864 | 64.000 MiB | 512.000 MiB |  12345x |   1.188 ms | 0.74% |   1.182 ms | 0.74% | 56.787G | 511.086 GB/s | 58.72% |   1.180 ms | 12346x |
+| I16 | I32 | 33554432 | 64.000 MiB | 128.000 MiB |  30679x | 453.377 us | 0.56% | 447.104 us | 0.57% | 75.048G | 450.290 GB/s | 51.73% | 446.394 us | 30680x |
+| I16 | F32 | 33554432 | 64.000 MiB | 128.000 MiB |   1111x | 456.445 us | 0.46% | 450.117 us | 0.47% | 74.546G | 447.276 GB/s | 51.39% | 447.497 us |  1162x |
+| I16 | I64 | 33554432 | 64.000 MiB | 256.000 MiB |  21586x | 663.639 us | 0.68% | 657.381 us | 0.69% | 51.043G | 510.426 GB/s | 58.64% | 656.117 us | 21587x |
+| I16 | F64 | 33554432 | 64.000 MiB | 256.000 MiB |  21638x | 661.733 us | 0.69% | 655.457 us | 0.70% | 51.192G | 511.925 GB/s | 58.81% | 653.902 us | 21639x |
+| I32 | F32 | 16777216 | 64.000 MiB |  64.000 MiB |  47462x | 272.530 us | 1.23% | 266.290 us | 1.26% | 63.003G | 504.028 GB/s | 57.91% | 264.891 us | 47463x |
+| I32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB |  35529x | 384.345 us | 0.71% | 378.098 us | 0.72% | 44.373G | 532.471 GB/s | 61.18% | 377.321 us | 35530x |
+| I32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB |  35498x | 384.783 us | 0.79% | 378.571 us | 0.81% | 44.317G | 531.807 GB/s | 61.10% | 377.827 us | 35499x |
+| F32 | I32 | 16777216 | 64.000 MiB |  64.000 MiB |  47607x | 271.689 us | 1.32% | 265.440 us | 1.35% | 63.205G | 505.642 GB/s | 58.09% | 263.872 us | 47608x |
+| F32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB |  35499x | 384.786 us | 0.78% | 378.543 us | 0.79% | 44.320G | 531.846 GB/s | 61.10% | 377.921 us | 35500x |
+| F32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB |  35509x | 384.556 us | 0.82% | 378.288 us | 0.84% | 44.350G | 532.204 GB/s | 61.14% | 377.510 us | 35510x |
+| I64 | F64 |  8388608 | 64.000 MiB |  64.000 MiB |  52100x | 242.456 us | 0.85% | 236.185 us | 0.88% | 35.517G | 568.273 GB/s | 65.29% | 235.095 us | 52101x |
+| F64 | I64 |  8388608 | 64.000 MiB |  64.000 MiB |  51780x | 244.316 us | 0.98% | 238.030 us | 1.01% | 35.242G | 563.869 GB/s | 64.78% | 236.852 us | 51781x |
+
+### [1] Quadro GP100
+
+| In  | Out |  Items   |   InSize   |   OutSize   | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-----|-----|----------|------------|-------------|---------|------------|-------|------------|-------|----------|--------------|--------|------------|--------|
+|  I8 | I16 | 67108864 | 64.000 MiB | 128.000 MiB |  21576x | 666.484 us | 0.80% | 661.753 us | 0.80% | 101.411G | 304.232 GB/s | 41.55% | 660.957 us | 21577x |
+|  I8 | I32 | 67108864 | 64.000 MiB | 256.000 MiB |  16864x | 862.918 us | 0.79% | 858.200 us | 0.79% |  78.197G | 390.986 GB/s | 53.40% | 857.568 us | 16865x |
+|  I8 | F32 | 67108864 | 64.000 MiB | 256.000 MiB |  16866x | 862.614 us | 0.80% | 857.883 us | 0.80% |  78.226G | 391.131 GB/s | 53.42% | 857.087 us | 16867x |
+|  I8 | I64 | 67108864 | 64.000 MiB | 512.000 MiB |  10113x |   1.460 ms | 0.55% |   1.455 ms | 0.55% |  46.113G | 415.013 GB/s | 56.68% |   1.454 ms | 10114x |
+|  I8 | F64 | 67108864 | 64.000 MiB | 512.000 MiB |  10100x |   1.462 ms | 0.54% |   1.457 ms | 0.55% |  46.053G | 414.480 GB/s | 56.61% |   1.456 ms | 10101x |
+| I16 | I32 | 33554432 | 64.000 MiB | 128.000 MiB |  30414x | 460.392 us | 0.78% | 455.669 us | 0.79% |  73.638G | 441.826 GB/s | 60.35% | 455.415 us | 30415x |
+| I16 | F32 | 33554432 | 64.000 MiB | 128.000 MiB |  30506x | 458.932 us | 0.78% | 454.242 us | 0.78% |  73.869G | 443.215 GB/s | 60.54% | 453.806 us | 30507x |
+| I16 | I64 | 33554432 | 64.000 MiB | 256.000 MiB |  19198x | 753.645 us | 0.56% | 748.931 us | 0.56% |  44.803G | 448.031 GB/s | 61.19% | 748.024 us | 19199x |
+| I16 | F64 | 33554432 | 64.000 MiB | 256.000 MiB |  19239x | 752.207 us | 0.54% | 747.517 us | 0.54% |  44.888G | 448.878 GB/s | 61.31% | 746.459 us | 19240x |
+| I32 | F32 | 16777216 | 64.000 MiB |  64.000 MiB |  47007x | 278.585 us | 0.78% | 273.880 us | 0.79% |  61.258G | 490.060 GB/s | 66.93% | 273.539 us | 47008x |
+| I32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB |   1196x | 422.857 us | 0.44% | 418.105 us | 0.44% |  40.127G | 481.521 GB/s | 65.77% | 416.192 us |  1254x |
+| I32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB |   1195x | 423.383 us | 0.47% | 418.703 us | 0.47% |  40.070G | 480.834 GB/s | 65.67% | 416.603 us |  1252x |
+| F32 | I32 | 16777216 | 64.000 MiB |  64.000 MiB |  46545x | 281.581 us | 1.23% | 276.847 us | 1.25% |  60.601G | 484.808 GB/s | 66.22% | 276.479 us | 46546x |
+| F32 | I64 | 16777216 | 64.000 MiB | 128.000 MiB |   1196x | 423.070 us | 0.46% | 418.391 us | 0.46% |  40.099G | 481.193 GB/s | 65.72% | 416.373 us |  1257x |
+| F32 | F64 | 16777216 | 64.000 MiB | 128.000 MiB |   1195x | 423.378 us | 0.47% | 418.690 us | 0.47% |  40.071G | 480.849 GB/s | 65.68% | 416.660 us |  1265x |
+| I64 | F64 |  8388608 | 64.000 MiB |  64.000 MiB |   1910x | 266.569 us | 0.42% | 261.885 us | 0.42% |  32.032G | 512.506 GB/s | 70.00% | 260.037 us |  2011x |
+| F64 | I64 |  8388608 | 64.000 MiB |  64.000 MiB |   1912x | 266.286 us | 0.42% | 261.586 us | 0.41% |  32.068G | 513.092 GB/s | 70.08% | 259.790 us |  2016x |
diff --git a/examples/outputs/nvbench.example.enums.csv b/examples/outputs/nvbench.example.enums.csv
new file mode 100644
index 0000000..e644155
--- /dev/null
+++ b/examples/outputs/nvbench.example.enums.csv
@@ -0,0 +1,27 @@
+Benchmark,Device,Device Name,MyEnum,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,SomeInts
+runtime_enum_sweep_string,0,Quadro GV100,A,No,499,0.0010101454729458924,0.00046483104049603716,0.001003841868144471,0.0005749097846815492,0.001001477914376195,523,
+runtime_enum_sweep_string,0,Quadro GV100,B,No,499,0.001010159408817635,0.0004613137827456428,0.0010038492447866406,0.0005555267067795092,0.001001477914376195,523,
+runtime_enum_sweep_string,0,Quadro GV100,C,No,499,0.0010101243687374746,0.0005092828188229102,0.001003840521246726,0.0005589757153765308,0.0010014759304418617,523,
+runtime_enum_sweep_string,1,Quadro GP100,A,No,499,0.0010076531543086172,0.0004980799779298182,0.0010027744387815845,0.0003286485696207647,0.0010014733467393249,524,
+runtime_enum_sweep_string,1,Quadro GP100,B,No,499,0.001007305567134269,0.0004149742922173277,0.0010026098881551395,0.0003090849629556725,0.0010014743950530773,524,
+runtime_enum_sweep_string,1,Quadro GP100,C,No,499,0.0010073099939879758,0.00041034663323030794,0.001002610978000388,0.0003063685132670013,0.0010014735796979365,524,
+runtime_enum_sweep_int64,0,Quadro GV100,0,No,499,0.0010101685511022043,0.00048173554195341087,0.001003846292266381,0.0005760736687258556,0.001001475909284053,524,
+runtime_enum_sweep_int64,0,Quadro GV100,1,No,499,0.0010101069098196389,0.000452307701913514,0.0010036457628907521,0.0004970727279364392,0.0010014759304418617,523,
+runtime_enum_sweep_int64,0,Quadro GV100,2,No,499,0.0010101263466933872,0.0004551387873866542,0.001003668336447824,0.0004936151206687181,0.0010014759304418617,523,
+runtime_enum_sweep_int64,1,Quadro GP100,0,No,499,0.0010076479038076157,0.0005063884103258074,0.0010027646934818884,0.0003189744202423173,0.0010014735796979365,524,
+runtime_enum_sweep_int64,1,Quadro GP100,1,No,499,0.0010073362545090188,0.0006783144138303011,0.001002585007575805,0.00032119540844936307,0.0010014733467393249,524,
+runtime_enum_sweep_int64,1,Quadro GP100,2,No,499,0.0010073002645290582,0.0004221934097940968,0.0010025900736122663,0.00030148599118709333,0.0010014748609703007,524,
+compile_time_enum_sweep,0,Quadro GV100,A,No,499,0.001010085394789578,0.0004655817990209206,0.001003753755278955,0.0005493729335684191,0.0010014759304418617,523,
+compile_time_enum_sweep,0,Quadro GV100,B,No,499,0.001010053595190381,0.0005071877844696576,0.0010036782112293535,0.0005775279883320534,0.001001473929135854,524,
+compile_time_enum_sweep,0,Quadro GV100,C,No,499,0.0010101191442885773,0.000491671020220968,0.001003674685835594,0.0005387103704343772,0.001001475909284053,524,
+compile_time_enum_sweep,1,Quadro GP100,A,No,499,0.0010072828116232475,0.0004397728983271746,0.001002574038171099,0.0003125584355538476,0.0010014731137807133,524,
+compile_time_enum_sweep,1,Quadro GP100,B,No,499,0.0010073154749498997,0.00040217119470421287,0.0010025884063544867,0.0002963185452948694,0.0010014750939289121,524,
+compile_time_enum_sweep,1,Quadro GP100,C,No,499,0.0010073215450901796,0.00042671482068318837,0.0010026036672936165,0.000296913783133073,0.0010014753268875237,524,
+compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101242705410818,0.00046763787302286317,0.0010036719920401053,0.0005214982620209435,0.001001477889432252,524,0
+compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101197975951911,0.00047310755052367136,0.0010037048890260939,0.0005711753095722302,0.0010014759304418617,523,16
+compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010100482024048096,0.00048230228130997395,0.0010036862909435461,0.0005703633321237566,0.001001475909284053,524,4096
+compile_time_int_sweep,0,Quadro GV100,,No,499,0.0010101047835671339,0.0004779495034261126,0.0010036583329011495,0.0004905372193961659,0.0010014759304418617,523,-12
+compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010073524008016035,0.00045954094220701026,0.001002591226765053,0.0003132035131857924,0.0010014731137807133,524,0
+compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010072802204408825,0.00041173035101990657,0.001002568971417949,0.00030006995446635803,0.0010014742785737715,524,16
+compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010072856112224454,0.00041573006780551387,0.0010025766641200192,0.0003017176216493314,0.0010014750939289121,524,4096
+compile_time_int_sweep,1,Quadro GP100,,No,499,0.0010073294088176355,0.00042082782124641986,0.001002588534402943,0.0002955692521037092,0.0010014733467393249,524,-12
diff --git a/examples/outputs/nvbench.example.enums.json b/examples/outputs/nvbench.example.enums.json
new file mode 100644
index 0000000..e4430f1
--- /dev/null
+++ b/examples/outputs/nvbench.example.enums.json
@@ -0,0 +1,3949 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "runtime_enum_sweep_string",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "MyEnum": {
+          "type": "string",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "A",
+              "description": "",
+              "value": "A"
+            },
+            {
+              "input_string": "B",
+              "description": "",
+              "value": "B"
+            },
+            {
+              "input_string": "C",
+              "description": "",
+              "value": "C"
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 MyEnum=A": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "A"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101454729458924"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00046483104049603716"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003841868144471"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005749097846815492"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001477914376195"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=B": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "B"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001010159408817635"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004613137827456428"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010038492447866406"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005555267067795092"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001477914376195"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=C": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "C"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101243687374746"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005092828188229102"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003840521246726"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005589757153765308"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=A": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "A"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010076531543086172"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004980799779298182"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010027744387815845"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003286485696207647"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014733467393249"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=B": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "B"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001007305567134269"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004149742922173277"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010026098881551395"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003090849629556725"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014743950530773"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=C": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "C"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073099939879758"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041034663323030794"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002610978000388"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003063685132670013"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014735796979365"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 1,
+      "name": "runtime_enum_sweep_int64",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "MyEnum": {
+          "type": "int64",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "0",
+              "description": "",
+              "value": 0
+            },
+            {
+              "input_string": "1",
+              "description": "",
+              "value": 1
+            },
+            {
+              "input_string": "2",
+              "description": "",
+              "value": 2
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 MyEnum=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101685511022043"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00048173554195341087"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003846292266381"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005760736687258556"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=1": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101069098196389"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000452307701913514"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036457628907521"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004970727279364392"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=2": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "2"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101263466933872"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004551387873866542"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003668336447824"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004936151206687181"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=0": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010076479038076157"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005063884103258074"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010027646934818884"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003189744202423173"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014735796979365"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "1"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073362545090188"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006783144138303011"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002585007575805"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00032119540844936307"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014733467393249"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=2": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "int64",
+              "value": "2"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073002645290582"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004221934097940968"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025900736122663"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030148599118709333"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014748609703007"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 2,
+      "name": "compile_time_enum_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "MyEnum": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "A",
+              "description": "MyEnum::ValueA",
+              "is_active": true
+            },
+            {
+              "input_string": "B",
+              "description": "MyEnum::ValueB",
+              "is_active": true
+            },
+            {
+              "input_string": "C",
+              "description": "MyEnum::ValueC",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 MyEnum=A": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "A"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001010085394789578"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004655817990209206"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003753755278955"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005493729335684191"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=B": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "B"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001010053595190381"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005071877844696576"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036782112293535"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005775279883320534"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473929135854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 MyEnum=C": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "C"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101191442885773"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000491671020220968"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003674685835594"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005387103704343772"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=A": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "A"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072828116232475"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004397728983271746"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002574038171099"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003125584355538476"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014731137807133"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=B": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "B"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073154749498997"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00040217119470421287"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025884063544867"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002963185452948694"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014750939289121"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 MyEnum=C": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "MyEnum": {
+              "type": "string",
+              "value": "C"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073215450901796"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042671482068318837"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010026036672936165"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000296913783133073"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014753268875237"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 3,
+      "name": "compile_time_int_sweep",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "SomeInts": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "0",
+              "description": "nvbench::enum_type<0, int>",
+              "is_active": true
+            },
+            {
+              "input_string": "16",
+              "description": "nvbench::enum_type<16, int>",
+              "is_active": true
+            },
+            {
+              "input_string": "4096",
+              "description": "nvbench::enum_type<4096, int>",
+              "is_active": true
+            },
+            {
+              "input_string": "-12",
+              "description": "nvbench::enum_type<-12, int>",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 SomeInts=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101242705410818"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00046763787302286317"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036719920401053"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005214982620209435"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001477889432252"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 SomeInts=16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101197975951911"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00047310755052367136"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010037048890260939"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005711753095722302"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 SomeInts=4096": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "4096"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010100482024048096"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00048230228130997395"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036862909435461"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005703633321237566"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 SomeInts=-12": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "-12"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101047835671339"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004779495034261126"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036583329011495"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004905372193961659"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 SomeInts=0": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "0"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073524008016035"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045954094220701026"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002591226765053"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003132035131857924"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014731137807133"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 SomeInts=16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072802204408825"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041173035101990657"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002568971417949"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00030006995446635803"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014742785737715"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 SomeInts=4096": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "4096"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072856112224454"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041573006780551387"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025766641200192"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003017176216493314"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014750939289121"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 SomeInts=-12": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "SomeInts": {
+              "type": "string",
+              "value": "-12"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073294088176355"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042082782124641986"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001002588534402943"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002955692521037092"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014733467393249"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.enums.list.md b/examples/outputs/nvbench.example.enums.list.md
new file mode 100644
index 0000000..9cb3a73
--- /dev/null
+++ b/examples/outputs/nvbench.example.enums.list.md
@@ -0,0 +1,67 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 30117 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14939 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `runtime_enum_sweep_string` (3 configurations)
+
+### Axes
+
+* `MyEnum` : string
+  * `A`
+  * `B`
+  * `C`
+
+## [1] `runtime_enum_sweep_int64` (3 configurations)
+
+### Axes
+
+* `MyEnum` : int64
+  * `0`
+  * `1`
+  * `2`
+
+## [2] `compile_time_enum_sweep` (3 configurations)
+
+### Axes
+
+* `MyEnum` : type
+  * `A` (MyEnum::ValueA)
+  * `B` (MyEnum::ValueB)
+  * `C` (MyEnum::ValueC)
+
+## [3] `compile_time_int_sweep` (4 configurations)
+
+### Axes
+
+* `SomeInts` : type
+  * `0` (nvbench::enum_type<0, int>)
+  * `16` (nvbench::enum_type<16, int>)
+  * `4096` (nvbench::enum_type<4096, int>)
+  * `-12` (nvbench::enum_type<-12, int>)
+
diff --git a/examples/outputs/nvbench.example.enums.md b/examples/outputs/nvbench.example.enums.md
new file mode 100644
index 0000000..583f140
--- /dev/null
+++ b/examples/outputs/nvbench.example.enums.md
@@ -0,0 +1,186 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  runtime_enum_sweep_string [Device=0 MyEnum=A]
+Pass: Cold: 1.003842ms GPU, 1.010145ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x
+Run:  runtime_enum_sweep_string [Device=0 MyEnum=B]
+Pass: Cold: 1.003849ms GPU, 1.010159ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x
+Run:  runtime_enum_sweep_string [Device=0 MyEnum=C]
+Pass: Cold: 1.003841ms GPU, 1.010124ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  runtime_enum_sweep_string [Device=1 MyEnum=A]
+Pass: Cold: 1.002774ms GPU, 1.007653ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_string [Device=1 MyEnum=B]
+Pass: Cold: 1.002610ms GPU, 1.007306ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_string [Device=1 MyEnum=C]
+Pass: Cold: 1.002611ms GPU, 1.007310ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_int64 [Device=0 MyEnum=0]
+Pass: Cold: 1.003846ms GPU, 1.010169ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_int64 [Device=0 MyEnum=1]
+Pass: Cold: 1.003646ms GPU, 1.010107ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  runtime_enum_sweep_int64 [Device=0 MyEnum=2]
+Pass: Cold: 1.003668ms GPU, 1.010126ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  runtime_enum_sweep_int64 [Device=1 MyEnum=0]
+Pass: Cold: 1.002765ms GPU, 1.007648ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_int64 [Device=1 MyEnum=1]
+Pass: Cold: 1.002585ms GPU, 1.007336ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  runtime_enum_sweep_int64 [Device=1 MyEnum=2]
+Pass: Cold: 1.002590ms GPU, 1.007300ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_enum_sweep [Device=0 MyEnum=A]
+Pass: Cold: 1.003754ms GPU, 1.010085ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  compile_time_enum_sweep [Device=0 MyEnum=B]
+Pass: Cold: 1.003678ms GPU, 1.010054ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_enum_sweep [Device=0 MyEnum=C]
+Pass: Cold: 1.003675ms GPU, 1.010119ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_enum_sweep [Device=1 MyEnum=A]
+Pass: Cold: 1.002574ms GPU, 1.007283ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_enum_sweep [Device=1 MyEnum=B]
+Pass: Cold: 1.002588ms GPU, 1.007315ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_enum_sweep [Device=1 MyEnum=C]
+Pass: Cold: 1.002604ms GPU, 1.007322ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=0 SomeInts=0]
+Pass: Cold: 1.003672ms GPU, 1.010124ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=0 SomeInts=16]
+Pass: Cold: 1.003705ms GPU, 1.010120ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  compile_time_int_sweep [Device=0 SomeInts=4096]
+Pass: Cold: 1.003686ms GPU, 1.010048ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=0 SomeInts=-12]
+Pass: Cold: 1.003658ms GPU, 1.010105ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  compile_time_int_sweep [Device=1 SomeInts=0]
+Pass: Cold: 1.002591ms GPU, 1.007352ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=1 SomeInts=16]
+Pass: Cold: 1.002569ms GPU, 1.007280ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=1 SomeInts=4096]
+Pass: Cold: 1.002577ms GPU, 1.007286ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  compile_time_int_sweep [Device=1 SomeInts=-12]
+Pass: Cold: 1.002589ms GPU, 1.007329ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+```
+
+# Benchmark Results
+
+## runtime_enum_sweep_string
+
+### [0] Quadro GV100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      A |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  523x |
+|      B |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  523x |
+|      C |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  523x |
+
+### [1] Quadro GP100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      A |    499x | 1.008 ms | 0.05% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      B |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      C |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+
+## runtime_enum_sweep_int64
+
+### [0] Quadro GV100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      0 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  524x |
+|      1 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+|      2 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+
+### [1] Quadro GP100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      0 |    499x | 1.008 ms | 0.05% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      1 |    499x | 1.007 ms | 0.07% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      2 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+
+## compile_time_enum_sweep
+
+### [0] Quadro GV100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      A |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+|      B |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  524x |
+|      C |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+
+### [1] Quadro GP100
+
+| MyEnum | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|--------|---------|----------|-------|----------|-------|-----------|-------|
+|      A |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      B |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      C |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+
+## compile_time_int_sweep
+
+### [0] Quadro GV100
+
+| SomeInts | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|----------|---------|----------|-------|----------|-------|-----------|-------|
+|        0 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+|       16 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  523x |
+|     4096 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.06% |  1.001 ms |  524x |
+|      -12 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+
+### [1] Quadro GP100
+
+| SomeInts | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|----------|---------|----------|-------|----------|-------|-----------|-------|
+|        0 |    499x | 1.007 ms | 0.05% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|       16 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|     4096 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|      -12 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
diff --git a/examples/outputs/nvbench.example.exec_tag_sync.csv b/examples/outputs/nvbench.example.exec_tag_sync.csv
new file mode 100644
index 0000000..d8eb773
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_sync.csv
@@ -0,0 +1,3 @@
+Benchmark,Device,Device Name,Skipped,Items,Size (bytes),Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak
+sequence_bench,0,Quadro GV100,No,16777216,67108864,88096,0.00011210838060751815,0.0043948813906645855,0.00010738264021815305,0.005226831698093829,156237693224.12143,624950772896.4857,0.7180041048902639
+sequence_bench,1,Quadro GP100,No,16777216,67108864,4236,0.0001220395009442869,0.0031913153957058224,0.00011805303109505499,0.0030945635166685077,142115927430.03076,568463709720.123,0.7764200580749058
diff --git a/examples/outputs/nvbench.example.exec_tag_sync.json b/examples/outputs/nvbench.example.exec_tag_sync.json
new file mode 100644
index 0000000..4d4aa0a
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_sync.json
@@ -0,0 +1,426 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "sequence_bench",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Size"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "88096"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00011210838060751815"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0043948813906645855"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00010738264021815305"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005226831698093829"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "156237693224.12143"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "624950772896.4857"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7180041048902639"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Element count: Items": {
+              "short_name": {
+                "type": "string",
+                "value": "Items"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Output Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Size"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "4236"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0001220395009442869"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0031913153957058224"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00011805303109505499"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0030945635166685077"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "142115927430.03076"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "568463709720.123"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.7764200580749058"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.exec_tag_sync.list.md b/examples/outputs/nvbench.example.exec_tag_sync.list.md
new file mode 100644
index 0000000..29a2bc7
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_sync.list.md
@@ -0,0 +1,32 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 30269 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14939 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `sequence_bench` (1 configurations)
+
diff --git a/examples/outputs/nvbench.example.exec_tag_sync.md b/examples/outputs/nvbench.example.exec_tag_sync.md
new file mode 100644
index 0000000..4b2059c
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_sync.md
@@ -0,0 +1,53 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  sequence_bench [Device=0]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.52% > 0.50%)
+Pass: Cold: 0.107383ms GPU, 0.112108ms CPU, 9.46s total GPU, 88096x
+Run:  sequence_bench [Device=1]
+Pass: Cold: 0.118053ms GPU, 0.122040ms CPU, 0.50s total GPU, 4236x
+```
+
+# Benchmark Results
+
+## sequence_bench
+
+### [0] Quadro GV100
+
+|  Items   |    Size    | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | GlobalMem BW | BWPeak |
+|----------|------------|---------|------------|-------|------------|-------|----------|--------------|--------|
+| 16777216 | 64.000 MiB |  88096x | 112.108 us | 0.44% | 107.383 us | 0.52% | 156.238G | 624.951 GB/s | 71.80% |
+
+### [1] Quadro GP100
+
+|  Items   |    Size    | Samples |  CPU Time  | Noise |  GPU Time  | Noise |  Elem/s  | GlobalMem BW | BWPeak |
+|----------|------------|---------|------------|-------|------------|-------|----------|--------------|--------|
+| 16777216 | 64.000 MiB |   4236x | 122.040 us | 0.32% | 118.053 us | 0.31% | 142.116G | 568.464 GB/s | 77.64% |
diff --git a/examples/outputs/nvbench.example.exec_tag_timer.csv b/examples/outputs/nvbench.example.exec_tag_timer.csv
new file mode 100644
index 0000000..9680594
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_timer.csv
@@ -0,0 +1,3 @@
+Benchmark,Device,Device Name,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak
+mod2_inplace,0,Quadro GV100,No,27572,0.00026979653764688707,0.009574712799615451,0.0002636220670364504,0.009810298459991085,63641167025.97682,509129336207.8146,0.5849371969299341
+mod2_inplace,1,Quadro GP100,No,26721,0.0002731037747090319,0.005791152713027165,0.000268388258513477,0.005821839019989222,62510990953.64315,500087927629.1452,0.6830309326228491
diff --git a/examples/outputs/nvbench.example.exec_tag_timer.json b/examples/outputs/nvbench.example.exec_tag_timer.json
new file mode 100644
index 0000000..d5650c9
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_timer.json
@@ -0,0 +1,378 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "mod2_inplace",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "27572"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026979653764688707"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009574712799615451"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002636220670364504"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009810298459991085"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63641167025.97682"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "509129336207.8146"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5849371969299341"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "26721"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002731037747090319"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005791152713027165"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000268388258513477"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.005821839019989222"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "62510990953.64315"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "500087927629.1452"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.6830309326228491"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.exec_tag_timer.list.md b/examples/outputs/nvbench.example.exec_tag_timer.list.md
new file mode 100644
index 0000000..09c2b13
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_timer.list.md
@@ -0,0 +1,32 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 30117 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14891 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `mod2_inplace` (1 configurations)
+
diff --git a/examples/outputs/nvbench.example.exec_tag_timer.md b/examples/outputs/nvbench.example.exec_tag_timer.md
new file mode 100644
index 0000000..8f842a2
--- /dev/null
+++ b/examples/outputs/nvbench.example.exec_tag_timer.md
@@ -0,0 +1,54 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  mod2_inplace [Device=0]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.98% > 0.50%)
+Pass: Cold: 0.263622ms GPU, 0.269797ms CPU, 7.27s total GPU, 27572x
+Run:  mod2_inplace [Device=1]
+Warn: Current measurement timed out (15.00s) while over noise threshold (0.58% > 0.50%)
+Pass: Cold: 0.268388ms GPU, 0.273104ms CPU, 7.17s total GPU, 26721x
+```
+
+# Benchmark Results
+
+## mod2_inplace
+
+### [0] Quadro GV100
+
+| Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak |
+|---------|------------|-------|------------|-------|---------|--------------|--------|
+|  27572x | 269.797 us | 0.96% | 263.622 us | 0.98% | 63.641G | 509.129 GB/s | 58.49% |
+
+### [1] Quadro GP100
+
+| Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak |
+|---------|------------|-------|------------|-------|---------|--------------|--------|
+|  26721x | 273.104 us | 0.58% | 268.388 us | 0.58% | 62.511G | 500.088 GB/s | 68.30% |
diff --git a/examples/outputs/nvbench.example.skip.csv b/examples/outputs/nvbench.example.skip.csv
new file mode 100644
index 0000000..4d85c73
--- /dev/null
+++ b/examples/outputs/nvbench.example.skip.csv
@@ -0,0 +1,71 @@
+Benchmark,Device,Device Name,Duration,Kramble,Skipped,Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Batch GPU (sec),Batch,In,Out
+runtime_skip,0,Quadro GV100,0,Foo,No,148083,1.0558313054165725e-05,0.03460045726144196,4.411147755695762e-06,0.10691914249190457,2.086331167986729e-06,239683,,
+runtime_skip,0,Quadro GV100,0.00025,Foo,No,1967,0.00026056515149974605,0.001785414177412443,0.0002542243968822252,0.0022747905334427393,0.0002519049829290819,2064,,
+runtime_skip,0,Quadro GV100,0.0005,Foo,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.00075,Foo,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.001,Foo,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0,Bar,No,147976,1.0477963967129967e-05,0.05193537216616394,4.309996180719637e-06,0.09631511021814754,2.1032374904355167e-06,237900,,
+runtime_skip,0,Quadro GV100,0.00025,Bar,No,1967,0.0002606047381799697,0.004402412277520235,0.00025425122354802296,0.0019316173072071725,0.00025190548564112467,2064,,
+runtime_skip,0,Quadro GV100,0.0005,Bar,No,993,0.0005103389546827793,0.0009044135054133329,0.0005038951597545129,0.0009132341980037791,0.0005017609577982818,1044,,
+runtime_skip,0,Quadro GV100,0.00075,Bar,No,664,0.0007602315828313258,0.0012393600029445503,0.0007537762098104041,0.0006523899466894487,0.0007516189640871593,697,,
+runtime_skip,0,Quadro GV100,0.001,Bar,No,499,0.0010100405511022032,0.0004675770563736675,0.0010036521122785227,0.000506242755428486,0.0010014759304418617,523,,
+runtime_skip,0,Quadro GV100,0,Baz,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.00025,Baz,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.0005,Baz,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.00075,Baz,Yes,,,,,,,,,
+runtime_skip,0,Quadro GV100,0.001,Baz,No,499,0.0010100214969939881,0.0004890967396052713,0.0010036210096431835,0.0005031795348101782,0.001001475909284053,524,,
+runtime_skip,1,Quadro GP100,0,Foo,No,152833,7.790139459409939e-06,0.052486926520143126,3.0540317811368876e-06,0.041965307035547426,1.3476766561262745e-06,371096,,
+runtime_skip,1,Quadro GP100,0.00025,Foo,No,1977,0.0002577336697015675,0.0016745726043248602,0.0002530303939943309,0.0012463205002970757,0.0002519046914100187,2073,,
+runtime_skip,1,Quadro GP100,0.0005,Foo,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.00075,Foo,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.001,Foo,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0,Bar,No,152569,7.89847543734305e-06,0.060900564134115376,3.1315013016758387e-06,0.06800325272918997,1.4410427557047706e-06,346971,,
+runtime_skip,1,Quadro GP100,0.00025,Bar,No,1977,0.00025772118614061663,0.0027168225148294085,0.0002530340679558949,0.0012784718063768952,0.0002519045087617526,2074,,
+runtime_skip,1,Quadro GP100,0.0005,Bar,No,995,0.0005076002613065325,0.0008452471055542708,0.0005028815761283412,0.0006166721165867504,0.0005017611416903409,1045,,
+runtime_skip,1,Quadro GP100,0.00075,Bar,No,665,0.0007574173563909778,0.0005404728977728138,0.0007527303210774766,0.00042097281680210553,0.0007516172567547905,698,,
+runtime_skip,1,Quadro GP100,0.001,Bar,No,499,0.0010073301743486979,0.0004181556191860385,0.0010026005258063262,0.0003018745447224176,0.0010014736961772425,524,,
+runtime_skip,1,Quadro GP100,0,Baz,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.00025,Baz,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.0005,Baz,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.00075,Baz,Yes,,,,,,,,,
+runtime_skip,1,Quadro GP100,0.001,Baz,No,499,0.0010072666492985964,0.0004034748822624355,0.0010025921845006082,0.0002988324879359347,0.001001474511532383,524,,
+skip_overload,0,Quadro GV100,,,Yes,,,,,,,,I32,I32
+skip_overload,0,Quadro GV100,,,No,499,0.0010101158116232471,0.00045457872179042216,0.00100363749086498,0.0004920954065745232,0.001001477914376195,523,I32,I64
+skip_overload,0,Quadro GV100,,,No,499,0.0010100936012024046,0.0004656012199546097,0.0010036259481089817,0.00048536100052596265,0.0010014759304418617,523,I64,I32
+skip_overload,0,Quadro GV100,,,Yes,,,,,,,,I64,I64
+skip_overload,1,Quadro GP100,,,Yes,,,,,,,,I32,I32
+skip_overload,1,Quadro GP100,,,No,499,0.0010072941002004009,0.00039832922996069914,0.0010025732686858855,0.0003053582386332306,0.0010014733467393249,524,I32,I64
+skip_overload,1,Quadro GP100,,,No,499,0.0010072575310621243,0.0004222831525597227,0.0010025880217313281,0.0003175275654782765,0.001001473929135854,524,I64,I32
+skip_overload,1,Quadro GP100,,,Yes,,,,,,,,I64,I64
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101381663326664,0.0004740828150298835,0.0010036533932408684,0.0005311247776469751,0.001001475909284053,524,I8,I8
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101366533066124,0.0004762668191055286,0.0010036824432785813,0.0005225074502850834,0.001001475909284053,524,I8,I16
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101454048096201,0.00045304078278456247,0.0010037151510586427,0.0005255710579717081,0.0010014777976741756,523,I8,I32
+skip_sfinae,0,Quadro GV100,,,No,499,0.001010251853707415,0.0004599262349912718,0.001003687190388386,0.0005303166463057176,0.001001477889432252,524,I8,I64
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I16,I8
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010102188396793595,0.0004716222422331537,0.0010036723759465746,0.0004916019059506635,0.0010014759304418617,523,I16,I16
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101248837675358,0.0004725693580783232,0.0010036886648567929,0.0005032992730078731,0.0010014777729529462,524,I16,I32
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101596753507014,0.0004462484178997469,0.0010037086084037075,0.0005439265200637692,0.0010014759304418617,523,I16,I64
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I32,I8
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I32,I16
+skip_sfinae,0,Quadro GV100,,,No,499,0.001010157290581163,0.00048501754313990346,0.001003686870267248,0.0005466560867161739,0.0010014759304418617,523,I32,I32
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101838817635274,0.00045302096806037583,0.0010036930261489573,0.0004856757915143649,0.001001475909284053,524,I32,I64
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I8
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I16
+skip_sfinae,0,Quadro GV100,,,Yes,,,,,,,,I64,I32
+skip_sfinae,0,Quadro GV100,,,No,499,0.0010101592084168342,0.00044733655275106617,0.0010036638461516183,0.00048495653843800993,0.0010014777976741756,523,I64,I64
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072615390781564,0.00041755776345504055,0.0010025953974179124,0.00031401673972615964,0.0010014743950530773,524,I8,I8
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072623366733464,0.0004115227504783334,0.0010025991167955256,0.0003150799184063617,0.0010014747444909947,524,I8,I16
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010073937254509017,0.00047436520440331174,0.0010025578190186219,0.0003174654268713629,0.0010014748609703007,524,I8,I32
+skip_sfinae,1,Quadro GP100,,,No,499,0.001007314288577154,0.00044943976467369485,0.0010025514015453848,0.00031771350129621625,0.0010014748609703007,524,I8,I64
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I16,I8
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072553206412834,0.00041747568393080776,0.0010025701303042489,0.0003128921530447738,0.0010014740456151597,524,I16,I16
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072746412825657,0.00041073548319461465,0.0010026024477276384,0.00031052785600836483,0.0010014741620944655,524,I16,I32
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072725270541083,0.0004158366079958995,0.0010025823137803163,0.0003051386556782456,0.0010014741620944655,524,I16,I64
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I32,I8
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I32,I16
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072895851703403,0.00041834159303612994,0.0010025929590026458,0.0002996916722121692,0.0010014740456151597,524,I32,I32
+skip_sfinae,1,Quadro GP100,,,No,499,0.0010072442464929862,0.0004215890640515976,0.0010025749361825603,0.0003141386734389898,0.001001474628011689,524,I32,I64
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I8
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I16
+skip_sfinae,1,Quadro GP100,,,Yes,,,,,,,,I64,I32
+skip_sfinae,1,Quadro GP100,,,No,499,0.001007259180360722,0.000416945457190629,0.0010025870577844691,0.0003166772418677883,0.001001475210408218,524,I64,I64
diff --git a/examples/outputs/nvbench.example.skip.json b/examples/outputs/nvbench.example.skip.json
new file mode 100644
index 0000000..f17f617
--- /dev/null
+++ b/examples/outputs/nvbench.example.skip.json
@@ -0,0 +1,6815 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "runtime_skip",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "Duration": {
+          "type": "float64",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "0",
+              "description": "",
+              "value": 0.0
+            },
+            {
+              "input_string": "0.00025",
+              "description": "",
+              "value": 0.00025
+            },
+            {
+              "input_string": "0.0005",
+              "description": "",
+              "value": 0.0005
+            },
+            {
+              "input_string": "0.00075",
+              "description": "",
+              "value": 0.00075
+            },
+            {
+              "input_string": "0.001",
+              "description": "",
+              "value": 0.001
+            }
+          ]
+        },
+        "Kramble": {
+          "type": "string",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "Foo",
+              "description": "",
+              "value": "Foo"
+            },
+            {
+              "input_string": "Bar",
+              "description": "",
+              "value": "Bar"
+            },
+            {
+              "input_string": "Baz",
+              "description": "",
+              "value": "Baz"
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 Duration=0 Kramble=Foo": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "148083"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.0558313054165725e-05"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.03460045726144196"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "4.411147755695762e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.10691914249190457"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "2.086331167986729e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "239683"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.00025 Kramble=Foo": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1967"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026056515149974605"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001785414177412443"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002542243968822252"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0022747905334427393"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002519049829290819"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2064"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0005 Kramble=Foo": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.00075 Kramble=Foo": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.001 Kramble=Foo": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=0 Duration=0 Kramble=Bar": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "147976"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.0477963967129967e-05"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.05193537216616394"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "4.309996180719637e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.09631511021814754"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "2.1032374904355167e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "237900"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.00025 Kramble=Bar": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1967"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002606047381799697"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.004402412277520235"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00025425122354802296"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0019316173072071725"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00025190548564112467"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2064"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.0005 Kramble=Bar": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "993"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005103389546827793"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009044135054133329"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005038951597545129"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0009132341980037791"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017609577982818"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1044"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.00075 Kramble=Bar": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "664"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007602315828313258"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012393600029445503"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007537762098104041"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006523899466894487"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007516189640871593"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "697"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0.001 Kramble=Bar": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010100405511022032"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004675770563736675"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036521122785227"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000506242755428486"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 Duration=0 Kramble=Baz": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.00025 Kramble=Baz": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.0005 Kramble=Baz": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.00075 Kramble=Baz": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=0 Duration=0.001 Kramble=Baz": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010100214969939881"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004890967396052713"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036210096431835"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005031795348101782"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0 Kramble=Foo": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "152833"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "7.790139459409939e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.052486926520143126"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.0540317811368876e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.041965307035547426"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.3476766561262745e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "371096"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.00025 Kramble=Foo": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1977"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002577336697015675"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0016745726043248602"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002530303939943309"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012463205002970757"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002519046914100187"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2073"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0005 Kramble=Foo": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.00075 Kramble=Foo": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.001 Kramble=Foo": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Foo"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Long 'Foo' benchmarks are skipped."
+        },
+        "Device=1 Duration=0 Kramble=Bar": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "152569"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "7.89847543734305e-06"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.060900564134115376"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "3.1315013016758387e-06"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.06800325272918997"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "1.4410427557047706e-06"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "346971"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.00025 Kramble=Bar": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1977"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00025772118614061663"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0027168225148294085"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002530340679558949"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0012784718063768952"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002519045087617526"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "2074"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.0005 Kramble=Bar": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "995"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005076002613065325"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0008452471055542708"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005028815761283412"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0006166721165867504"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005017611416903409"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "1045"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.00075 Kramble=Bar": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "665"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007574173563909778"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005404728977728138"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007527303210774766"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00042097281680210553"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0007516172567547905"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "698"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0.001 Kramble=Bar": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Bar"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073301743486979"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004181556191860385"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010026005258063262"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003018745447224176"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014736961772425"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 Duration=0 Kramble=Baz": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.00025 Kramble=Baz": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00025"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.0005 Kramble=Baz": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.0005"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.00075 Kramble=Baz": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.00075"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "Short 'Baz' benchmarks are skipped."
+        },
+        "Device=1 Duration=0.001 Kramble=Baz": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "Duration": {
+              "type": "float64",
+              "value": "0.001"
+            },
+            "Kramble": {
+              "type": "string",
+              "value": "Baz"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072666492985964"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004034748822624355"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025921845006082"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002988324879359347"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001474511532383"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    },
+    {
+      "index": 1,
+      "name": "skip_overload",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "In": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            }
+          ]
+        },
+        "Out": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 In=I32 Out=I32": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "InputType == OutputType."
+        },
+        "Device=0 In=I32 Out=I64": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101158116232471"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045457872179042216"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00100363749086498"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004920954065745232"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001477914376195"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010100936012024046"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004656012199546097"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036259481089817"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00048536100052596265"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I64": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "InputType == OutputType."
+        },
+        "Device=1 In=I32 Out=I32": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "InputType == OutputType."
+        },
+        "Device=1 In=I32 Out=I64": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072941002004009"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00039832922996069914"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025732686858855"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003053582386332306"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014733467393249"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072575310621243"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004222831525597227"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025880217313281"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003175275654782765"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001473929135854"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I64": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "InputType == OutputType."
+        }
+      }
+    },
+    {
+      "index": 2,
+      "name": "skip_sfinae",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": {
+        "In": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I8",
+              "description": "int8_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I16",
+              "description": "int16_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            }
+          ]
+        },
+        "Out": {
+          "type": "type",
+          "flags": "",
+          "values": [
+            {
+              "input_string": "I8",
+              "description": "int8_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I16",
+              "description": "int16_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I32",
+              "description": "int32_t",
+              "is_active": true
+            },
+            {
+              "input_string": "I64",
+              "description": "int64_t",
+              "is_active": true
+            }
+          ]
+        }
+      },
+      "states": {
+        "Device=0 In=I8 Out=I8": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101381663326664"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004740828150298835"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036533932408684"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005311247776469751"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=I16": {
+          "device": 0,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101366533066124"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004762668191055286"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036824432785813"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005225074502850834"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=I32": {
+          "device": 0,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101454048096201"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045304078278456247"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010037151510586427"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005255710579717081"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014777976741756"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I8 Out=I64": {
+          "device": 0,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001010251853707415"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004599262349912718"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003687190388386"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005303166463057176"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001477889432252"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I8": {
+          "device": 0,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I16 Out=I16": {
+          "device": 0,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010102188396793595"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004716222422331537"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036723759465746"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004916019059506635"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I32": {
+          "device": 0,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101248837675358"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004725693580783232"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036886648567929"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005032992730078731"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014777729529462"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I16 Out=I64": {
+          "device": 0,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101596753507014"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004462484178997469"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010037086084037075"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005439265200637692"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I8": {
+          "device": 0,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I16": {
+          "device": 0,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I32 Out=I32": {
+          "device": 0,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001010157290581163"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00048501754313990346"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001003686870267248"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0005466560867161739"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014759304418617"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I32 Out=I64": {
+          "device": 0,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101838817635274"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00045302096806037583"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036930261489573"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004856757915143649"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475909284053"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=0 In=I64 Out=I8": {
+          "device": 0,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I16": {
+          "device": 0,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I32": {
+          "device": 0,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=0 In=I64 Out=I64": {
+          "device": 0,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010101592084168342"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00044733655275106617"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010036638461516183"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00048495653843800993"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014777976741756"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "523"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I8": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072615390781564"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041755776345504055"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025953974179124"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00031401673972615964"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014743950530773"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I16": {
+          "device": 1,
+          "type_config_index": 1,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072623366733464"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004115227504783334"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025991167955256"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003150799184063617"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014747444909947"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I32": {
+          "device": 1,
+          "type_config_index": 2,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010073937254509017"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00047436520440331174"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025578190186219"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003174654268713629"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014748609703007"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I8 Out=I64": {
+          "device": 1,
+          "type_config_index": 3,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I8"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001007314288577154"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00044943976467369485"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025514015453848"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00031771350129621625"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014748609703007"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=I8": {
+          "device": 1,
+          "type_config_index": 4,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I16 Out=I16": {
+          "device": 1,
+          "type_config_index": 5,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072553206412834"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041747568393080776"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025701303042489"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003128921530447738"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014740456151597"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=I32": {
+          "device": 1,
+          "type_config_index": 6,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072746412825657"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041073548319461465"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010026024477276384"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00031052785600836483"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014741620944655"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I16 Out=I64": {
+          "device": 1,
+          "type_config_index": 7,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I16"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072725270541083"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004158366079958995"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025823137803163"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003051386556782456"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014741620944655"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I8": {
+          "device": 1,
+          "type_config_index": 8,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I16": {
+          "device": 1,
+          "type_config_index": 9,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I32 Out=I32": {
+          "device": 1,
+          "type_config_index": 10,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072895851703403"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00041834159303612994"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025929590026458"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002996916722121692"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010014740456151597"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I32 Out=I64": {
+          "device": 1,
+          "type_config_index": 11,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I32"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010072442464929862"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0004215890640515976"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025749361825603"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003141386734389898"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001474628011689"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1 In=I64 Out=I8": {
+          "device": 1,
+          "type_config_index": 12,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I8"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I16": {
+          "device": 1,
+          "type_config_index": 13,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I16"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I32": {
+          "device": 1,
+          "type_config_index": 14,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I32"
+            }
+          },
+          "summaries": null,
+          "is_skipped": true,
+          "skip_reason": "sizeof(InputType) > sizeof(OutputType)."
+        },
+        "Device=1 In=I64 Out=I64": {
+          "device": 1,
+          "type_config_index": 15,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": {
+            "In": {
+              "type": "string",
+              "value": "I64"
+            },
+            "Out": {
+              "type": "string",
+              "value": "I64"
+            }
+          },
+          "summaries": {
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "499"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001007259180360722"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.000416945457190629"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0010025870577844691"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0003166772418677883"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.001001475210408218"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "524"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.skip.list.md b/examples/outputs/nvbench.example.skip.list.md
new file mode 100644
index 0000000..a096004
--- /dev/null
+++ b/examples/outputs/nvbench.example.skip.list.md
@@ -0,0 +1,71 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 31309 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15467 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `runtime_skip` (15 configurations)
+
+### Axes
+
+* `Duration` : float64
+  * `0`
+  * `0.00025`
+  * `0.0005`
+  * `0.00075`
+  * `0.001`
+* `Kramble` : string
+  * `Foo`
+  * `Bar`
+  * `Baz`
+
+## [1] `skip_overload` (4 configurations)
+
+### Axes
+
+* `In` : type
+  * `I32` (int32_t)
+  * `I64` (int64_t)
+* `Out` : type
+  * `I32` (int32_t)
+  * `I64` (int64_t)
+
+## [2] `skip_sfinae` (16 configurations)
+
+### Axes
+
+* `In` : type
+  * `I8` (int8_t)
+  * `I16` (int16_t)
+  * `I32` (int32_t)
+  * `I64` (int64_t)
+* `Out` : type
+  * `I8` (int8_t)
+  * `I16` (int16_t)
+  * `I32` (int32_t)
+  * `I64` (int64_t)
+
diff --git a/examples/outputs/nvbench.example.skip.md b/examples/outputs/nvbench.example.skip.md
new file mode 100644
index 0000000..62e5ca7
--- /dev/null
+++ b/examples/outputs/nvbench.example.skip.md
@@ -0,0 +1,296 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  runtime_skip [Device=0 Duration=0 Kramble=Foo]
+Warn: Current measurement timed out (15.00s) while over noise threshold (10.69% > 0.50%)
+Pass: Cold: 0.004411ms GPU, 0.010558ms CPU, 0.65s total GPU, 148083x
+Pass: Batch: 0.002086ms GPU, 0.50s total GPU, 239683x
+Run:  runtime_skip [Device=0 Duration=0.00025 Kramble=Foo]
+Pass: Cold: 0.254224ms GPU, 0.260565ms CPU, 0.50s total GPU, 1967x
+Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2064x
+Run:  runtime_skip [Device=0 Duration=0.0005 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.00075 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.001 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0 Kramble=Bar]
+Warn: Current measurement timed out (15.00s) while over noise threshold (9.63% > 0.50%)
+Pass: Cold: 0.004310ms GPU, 0.010478ms CPU, 0.64s total GPU, 147976x
+Pass: Batch: 0.002103ms GPU, 0.50s total GPU, 237900x
+Run:  runtime_skip [Device=0 Duration=0.00025 Kramble=Bar]
+Pass: Cold: 0.254251ms GPU, 0.260605ms CPU, 0.50s total GPU, 1967x
+Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2064x
+Run:  runtime_skip [Device=0 Duration=0.0005 Kramble=Bar]
+Pass: Cold: 0.503895ms GPU, 0.510339ms CPU, 0.50s total GPU, 993x
+Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1044x
+Run:  runtime_skip [Device=0 Duration=0.00075 Kramble=Bar]
+Pass: Cold: 0.753776ms GPU, 0.760232ms CPU, 0.50s total GPU, 664x
+Pass: Batch: 0.751619ms GPU, 0.52s total GPU, 697x
+Run:  runtime_skip [Device=0 Duration=0.001 Kramble=Bar]
+Pass: Cold: 1.003652ms GPU, 1.010041ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  runtime_skip [Device=0 Duration=0 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.00025 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.0005 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.00075 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=0 Duration=0.001 Kramble=Baz]
+Pass: Cold: 1.003621ms GPU, 1.010021ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  runtime_skip [Device=1 Duration=0 Kramble=Foo]
+Warn: Current measurement timed out (15.00s) while over noise threshold (4.20% > 0.50%)
+Warn: Current measurement timed out (15.00s) before accumulating min_time (0.47s < 0.50s)
+Pass: Cold: 0.003054ms GPU, 0.007790ms CPU, 0.47s total GPU, 152833x
+Pass: Batch: 0.001348ms GPU, 0.50s total GPU, 371096x
+Run:  runtime_skip [Device=1 Duration=0.00025 Kramble=Foo]
+Pass: Cold: 0.253030ms GPU, 0.257734ms CPU, 0.50s total GPU, 1977x
+Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2073x
+Run:  runtime_skip [Device=1 Duration=0.0005 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.00075 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.001 Kramble=Foo]
+Skip: Long 'Foo' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0 Kramble=Bar]
+Warn: Current measurement timed out (15.00s) while over noise threshold (6.80% > 0.50%)
+Warn: Current measurement timed out (15.00s) before accumulating min_time (0.48s < 0.50s)
+Pass: Cold: 0.003132ms GPU, 0.007898ms CPU, 0.48s total GPU, 152569x
+Pass: Batch: 0.001441ms GPU, 0.50s total GPU, 346971x
+Run:  runtime_skip [Device=1 Duration=0.00025 Kramble=Bar]
+Pass: Cold: 0.253034ms GPU, 0.257721ms CPU, 0.50s total GPU, 1977x
+Pass: Batch: 0.251905ms GPU, 0.52s total GPU, 2074x
+Run:  runtime_skip [Device=1 Duration=0.0005 Kramble=Bar]
+Pass: Cold: 0.502882ms GPU, 0.507600ms CPU, 0.50s total GPU, 995x
+Pass: Batch: 0.501761ms GPU, 0.52s total GPU, 1045x
+Run:  runtime_skip [Device=1 Duration=0.00075 Kramble=Bar]
+Pass: Cold: 0.752730ms GPU, 0.757417ms CPU, 0.50s total GPU, 665x
+Pass: Batch: 0.751617ms GPU, 0.52s total GPU, 698x
+Run:  runtime_skip [Device=1 Duration=0.001 Kramble=Bar]
+Pass: Cold: 1.002601ms GPU, 1.007330ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  runtime_skip [Device=1 Duration=0 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.00025 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.0005 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.00075 Kramble=Baz]
+Skip: Short 'Baz' benchmarks are skipped.
+Run:  runtime_skip [Device=1 Duration=0.001 Kramble=Baz]
+Pass: Cold: 1.002592ms GPU, 1.007267ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  skip_overload [Device=0 In=I32 Out=I32]
+Skip: InputType == OutputType.
+Run:  skip_overload [Device=0 In=I32 Out=I64]
+Pass: Cold: 1.003637ms GPU, 1.010116ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x
+Run:  skip_overload [Device=0 In=I64 Out=I32]
+Pass: Cold: 1.003626ms GPU, 1.010094ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  skip_overload [Device=0 In=I64 Out=I64]
+Skip: InputType == OutputType.
+Run:  skip_overload [Device=1 In=I32 Out=I32]
+Skip: InputType == OutputType.
+Run:  skip_overload [Device=1 In=I32 Out=I64]
+Pass: Cold: 1.002573ms GPU, 1.007294ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001473ms GPU, 0.52s total GPU, 524x
+Run:  skip_overload [Device=1 In=I64 Out=I32]
+Pass: Cold: 1.002588ms GPU, 1.007258ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_overload [Device=1 In=I64 Out=I64]
+Skip: InputType == OutputType.
+Run:  skip_sfinae [Device=0 In=I8 Out=I8]
+Pass: Cold: 1.003653ms GPU, 1.010138ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=0 In=I8 Out=I16]
+Pass: Cold: 1.003682ms GPU, 1.010137ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=0 In=I8 Out=I32]
+Pass: Cold: 1.003715ms GPU, 1.010145ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x
+Run:  skip_sfinae [Device=0 In=I8 Out=I64]
+Pass: Cold: 1.003687ms GPU, 1.010252ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=0 In=I16 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I16 Out=I16]
+Pass: Cold: 1.003672ms GPU, 1.010219ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  skip_sfinae [Device=0 In=I16 Out=I32]
+Pass: Cold: 1.003689ms GPU, 1.010125ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=0 In=I16 Out=I64]
+Pass: Cold: 1.003709ms GPU, 1.010160ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  skip_sfinae [Device=0 In=I32 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I32 Out=I16]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I32 Out=I32]
+Pass: Cold: 1.003687ms GPU, 1.010157ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 523x
+Run:  skip_sfinae [Device=0 In=I32 Out=I64]
+Pass: Cold: 1.003693ms GPU, 1.010184ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001476ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=0 In=I64 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I64 Out=I16]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I64 Out=I32]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=0 In=I64 Out=I64]
+Pass: Cold: 1.003664ms GPU, 1.010159ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001478ms GPU, 0.52s total GPU, 523x
+Run:  skip_sfinae [Device=1 In=I8 Out=I8]
+Pass: Cold: 1.002595ms GPU, 1.007262ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I8 Out=I16]
+Pass: Cold: 1.002599ms GPU, 1.007262ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I8 Out=I32]
+Pass: Cold: 1.002558ms GPU, 1.007394ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I8 Out=I64]
+Pass: Cold: 1.002551ms GPU, 1.007314ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I16 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I16 Out=I16]
+Pass: Cold: 1.002570ms GPU, 1.007255ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I16 Out=I32]
+Pass: Cold: 1.002602ms GPU, 1.007275ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I16 Out=I64]
+Pass: Cold: 1.002582ms GPU, 1.007273ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I32 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I32 Out=I16]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I32 Out=I32]
+Pass: Cold: 1.002593ms GPU, 1.007290ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001474ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I32 Out=I64]
+Pass: Cold: 1.002575ms GPU, 1.007244ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+Run:  skip_sfinae [Device=1 In=I64 Out=I8]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I64 Out=I16]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I64 Out=I32]
+Skip: sizeof(InputType) > sizeof(OutputType).
+Run:  skip_sfinae [Device=1 In=I64 Out=I64]
+Pass: Cold: 1.002587ms GPU, 1.007259ms CPU, 0.50s total GPU, 499x
+Pass: Batch: 1.001475ms GPU, 0.52s total GPU, 524x
+```
+
+# Benchmark Results
+
+## runtime_skip
+
+### [0] Quadro GV100
+
+| Duration | Kramble | Samples |  CPU Time  | Noise |  GPU Time  | Noise  | Batch GPU  |  Batch  |
+|----------|---------|---------|------------|-------|------------|--------|------------|---------|
+|        0 |     Foo | 148083x |  10.558 us | 3.46% |   4.411 us | 10.69% |   2.086 us | 239683x |
+|  0.00025 |     Foo |   1967x | 260.565 us | 0.18% | 254.224 us |  0.23% | 251.905 us |   2064x |
+|        0 |     Bar | 147976x |  10.478 us | 5.19% |   4.310 us |  9.63% |   2.103 us | 237900x |
+|  0.00025 |     Bar |   1967x | 260.605 us | 0.44% | 254.251 us |  0.19% | 251.905 us |   2064x |
+|   0.0005 |     Bar |    993x | 510.339 us | 0.09% | 503.895 us |  0.09% | 501.761 us |   1044x |
+|  0.00075 |     Bar |    664x | 760.232 us | 0.12% | 753.776 us |  0.07% | 751.619 us |    697x |
+|    0.001 |     Bar |    499x |   1.010 ms | 0.05% |   1.004 ms |  0.05% |   1.001 ms |    523x |
+|    0.001 |     Baz |    499x |   1.010 ms | 0.05% |   1.004 ms |  0.05% |   1.001 ms |    524x |
+
+### [1] Quadro GP100
+
+| Duration | Kramble | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Batch GPU  |  Batch  |
+|----------|---------|---------|------------|-------|------------|-------|------------|---------|
+|        0 |     Foo | 152833x |   7.790 us | 5.25% |   3.054 us | 4.20% |   1.348 us | 371096x |
+|  0.00025 |     Foo |   1977x | 257.734 us | 0.17% | 253.030 us | 0.12% | 251.905 us |   2073x |
+|        0 |     Bar | 152569x |   7.898 us | 6.09% |   3.132 us | 6.80% |   1.441 us | 346971x |
+|  0.00025 |     Bar |   1977x | 257.721 us | 0.27% | 253.034 us | 0.13% | 251.905 us |   2074x |
+|   0.0005 |     Bar |    995x | 507.600 us | 0.08% | 502.882 us | 0.06% | 501.761 us |   1045x |
+|  0.00075 |     Bar |    665x | 757.417 us | 0.05% | 752.730 us | 0.04% | 751.617 us |    698x |
+|    0.001 |     Bar |    499x |   1.007 ms | 0.04% |   1.003 ms | 0.03% |   1.001 ms |    524x |
+|    0.001 |     Baz |    499x |   1.007 ms | 0.04% |   1.003 ms | 0.03% |   1.001 ms |    524x |
+
+## skip_overload
+
+### [0] Quadro GV100
+
+| In  | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|-----|-----|---------|----------|-------|----------|-------|-----------|-------|
+| I32 | I64 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+| I64 | I32 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+
+### [1] Quadro GP100
+
+| In  | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|-----|-----|---------|----------|-------|----------|-------|-----------|-------|
+| I32 | I64 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I64 | I32 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+
+## skip_sfinae
+
+### [0] Quadro GV100
+
+| In  | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|-----|-----|---------|----------|-------|----------|-------|-----------|-------|
+|  I8 |  I8 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+|  I8 | I16 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+|  I8 | I32 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+|  I8 | I64 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+| I16 | I16 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+| I16 | I32 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+| I16 | I64 |    499x | 1.010 ms | 0.04% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+| I32 | I32 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+| I32 | I64 |    499x | 1.010 ms | 0.05% | 1.004 ms | 0.05% |  1.001 ms |  524x |
+| I64 | I64 |    499x | 1.010 ms | 0.04% | 1.004 ms | 0.05% |  1.001 ms |  523x |
+
+### [1] Quadro GP100
+
+| In  | Out | Samples | CPU Time | Noise | GPU Time | Noise | Batch GPU | Batch |
+|-----|-----|---------|----------|-------|----------|-------|-----------|-------|
+|  I8 |  I8 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|  I8 | I16 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|  I8 | I32 |    499x | 1.007 ms | 0.05% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+|  I8 | I64 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I16 | I16 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I16 | I32 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I16 | I64 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I32 | I32 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I32 | I64 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
+| I64 | I64 |    499x | 1.007 ms | 0.04% | 1.003 ms | 0.03% |  1.001 ms |  524x |
diff --git a/examples/outputs/nvbench.example.throughput.csv b/examples/outputs/nvbench.example.throughput.csv
new file mode 100644
index 0000000..30eca18
--- /dev/null
+++ b/examples/outputs/nvbench.example.throughput.csv
@@ -0,0 +1,3 @@
+Benchmark,Device,Device Name,Skipped,NumElements,DataSize (bytes),Samples,CPU Time (sec),Noise,GPU Time (sec),Noise,Elem/s (elem/sec),GlobalMem BW (bytes/sec),BWPeak,Batch GPU (sec),Batch
+throughput_bench,0,Quadro GV100,No,16777216,67108864,47755,0.00027093838689142973,0.011175841617840646,0.00026478739019249176,0.011463549240955353,63361083727.603165,506888669820.8253,0.5823629019081173,0.0002632571401085129,47756
+throughput_bench,1,Quadro GP100,No,16777216,67108864,46734,0.00028047131987418375,0.009878517915727511,0.0002757727249773843,0.009988896437015563,60837111434.337364,486696891474.6989,0.664741165147917,0.0002754124925807889,46735
diff --git a/examples/outputs/nvbench.example.throughput.json b/examples/outputs/nvbench.example.throughput.json
new file mode 100644
index 0000000..e13072e
--- /dev/null
+++ b/examples/outputs/nvbench.example.throughput.json
@@ -0,0 +1,498 @@
+{
+  "devices": [
+    {
+      "id": 0,
+      "name": "Quadro GV100",
+      "sm_version": 700,
+      "ptx_version": 700,
+      "sm_default_clock_rate": 1627000000,
+      "number_of_sms": 80,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 34086060032,
+      "global_memory_bus_peak_clock_rate": 850000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 870400000000,
+      "l2_cache_size": 6291456,
+      "shared_memory_per_sm": 98304,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    },
+    {
+      "id": 1,
+      "name": "Quadro GP100",
+      "sm_version": 600,
+      "ptx_version": 600,
+      "sm_default_clock_rate": 1442500000,
+      "number_of_sms": 56,
+      "max_blocks_per_sm": 32,
+      "max_threads_per_sm": 2048,
+      "max_threads_per_block": 1024,
+      "registers_per_sm": 65536,
+      "registers_per_block": 65536,
+      "global_memory_size": 17069309952,
+      "global_memory_bus_peak_clock_rate": 715000000,
+      "global_memory_bus_width": 4096,
+      "global_memory_bus_bandwidth": 732160000000,
+      "l2_cache_size": 4194304,
+      "shared_memory_per_sm": 65536,
+      "shared_memory_per_block": 49152,
+      "ecc_state": false
+    }
+  ],
+  "benchmarks": [
+    {
+      "index": 0,
+      "name": "throughput_bench",
+      "min_samples": 10,
+      "min_time": 0.5,
+      "max_noise": 0.005,
+      "skip_time": -1.0,
+      "timeout": 15.0,
+      "devices": [
+        0,
+        1
+      ],
+      "axes": null,
+      "states": {
+        "Device=0": {
+          "device": 0,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Element count: NumElements": {
+              "short_name": {
+                "type": "string",
+                "value": "NumElements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "DataSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47755"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00027093838689142973"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011175841617840646"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00026478739019249176"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.011463549240955353"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "63361083727.603165"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "506888669820.8253"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.5823629019081173"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002632571401085129"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "47756"
+              }
+            }
+          },
+          "is_skipped": false
+        },
+        "Device=1": {
+          "device": 1,
+          "type_config_index": 0,
+          "min_samples": 10,
+          "min_time": 0.5,
+          "max_noise": 0.005,
+          "skip_time": -1.0,
+          "timeout": 15.0,
+          "axis_values": null,
+          "summaries": {
+            "Element count: NumElements": {
+              "short_name": {
+                "type": "string",
+                "value": "NumElements"
+              },
+              "value": {
+                "type": "int64",
+                "value": "16777216"
+              }
+            },
+            "Input Buffer Size: ": {
+              "hint": {
+                "type": "string",
+                "value": "bytes"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "DataSize"
+              },
+              "value": {
+                "type": "int64",
+                "value": "67108864"
+              }
+            },
+            "Number of Samples (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Samples"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in cold time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "46734"
+              }
+            },
+            "Average CPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "CPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time observed from host."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.00028047131987418375"
+              }
+            },
+            "CPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold CPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009878517915727511"
+              }
+            },
+            "Average GPU Time (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GPU Time"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average isolated kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002757727249773843"
+              }
+            },
+            "GPU Relative Standard Deviation (Cold)": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Noise"
+              },
+              "description": {
+                "type": "string",
+                "value": "Relative standard deviation of the cold GPU execution time measurements."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.009988896437015563"
+              }
+            },
+            "Element Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "item_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Elem/s"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of input elements handled per second."
+              },
+              "value": {
+                "type": "float64",
+                "value": "60837111434.337364"
+              }
+            },
+            "Average Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "byte_rate"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "GlobalMem BW"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of bytes read/written per second to the CUDA device's global memory."
+              },
+              "value": {
+                "type": "float64",
+                "value": "486696891474.6989"
+              }
+            },
+            "Percent Peak Global Memory Throughput": {
+              "hint": {
+                "type": "string",
+                "value": "percentage"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "BWPeak"
+              },
+              "description": {
+                "type": "string",
+                "value": "Global device memory throughput as a percentage of the device's peak bandwidth."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.664741165147917"
+              }
+            },
+            "Average GPU Time (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "duration"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch GPU"
+              },
+              "description": {
+                "type": "string",
+                "value": "Average back-to-back kernel execution time as measured by CUDA events."
+              },
+              "value": {
+                "type": "float64",
+                "value": "0.0002754124925807889"
+              }
+            },
+            "Number of Samples (Batch)": {
+              "hint": {
+                "type": "string",
+                "value": "sample_size"
+              },
+              "short_name": {
+                "type": "string",
+                "value": "Batch"
+              },
+              "description": {
+                "type": "string",
+                "value": "Number of kernel executions in hot time measurements."
+              },
+              "value": {
+                "type": "int64",
+                "value": "46735"
+              }
+            }
+          },
+          "is_skipped": false
+        }
+      }
+    }
+  ]
+}
diff --git a/examples/outputs/nvbench.example.throughput.list.md b/examples/outputs/nvbench.example.throughput.list.md
new file mode 100644
index 0000000..b39ce69
--- /dev/null
+++ b/examples/outputs/nvbench.example.throughput.list.md
@@ -0,0 +1,32 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 30117 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 14939 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Benchmarks
+
+## [0] `throughput_bench` (1 configurations)
+
diff --git a/examples/outputs/nvbench.example.throughput.md b/examples/outputs/nvbench.example.throughput.md
new file mode 100644
index 0000000..50198c6
--- /dev/null
+++ b/examples/outputs/nvbench.example.throughput.md
@@ -0,0 +1,56 @@
+# Devices
+
+## [0] `Quadro GV100`
+* SM Version: 700 (PTX Version: 700)
+* Number of SMs: 80
+* SM Default Clock Rate: 1627 MHz
+* Global Memory: 32163 MiB Free / 32507 MiB Total
+* Global Memory Bus Peak: 870 GB/sec (4096-bit DDR @850MHz)
+* Max Shared Memory: 96 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 6144 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+## [1] `Quadro GP100`
+* SM Version: 600 (PTX Version: 600)
+* Number of SMs: 56
+* SM Default Clock Rate: 1442 MHz
+* Global Memory: 15999 MiB Free / 16278 MiB Total
+* Global Memory Bus Peak: 732 GB/sec (4096-bit DDR @715MHz)
+* Max Shared Memory: 64 KiB/SM, 48 KiB/Block
+* L2 Cache Size: 4096 KiB
+* Maximum Active Blocks: 32/SM
+* Maximum Active Threads: 2048/SM, 1024/Block
+* Available Registers: 65536/SM, 65536/Block
+* ECC Enabled: No
+
+# Log
+
+```
+Run:  throughput_bench [Device=0]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.15% > 0.50%)
+Pass: Cold: 0.264787ms GPU, 0.270938ms CPU, 12.64s total GPU, 47755x
+Pass: Batch: 0.263257ms GPU, 12.57s total GPU, 47756x
+Run:  throughput_bench [Device=1]
+Warn: Current measurement timed out (15.00s) while over noise threshold (1.00% > 0.50%)
+Pass: Cold: 0.275773ms GPU, 0.280471ms CPU, 12.89s total GPU, 46734x
+Pass: Batch: 0.275412ms GPU, 12.87s total GPU, 46735x
+```
+
+# Benchmark Results
+
+## throughput_bench
+
+### [0] Quadro GV100
+
+| NumElements |  DataSize  | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-------------|------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------|
+|    16777216 | 64.000 MiB |  47755x | 270.938 us | 1.12% | 264.787 us | 1.15% | 63.361G | 506.889 GB/s | 58.24% | 263.257 us | 47756x |
+
+### [1] Quadro GP100
+
+| NumElements |  DataSize  | Samples |  CPU Time  | Noise |  GPU Time  | Noise | Elem/s  | GlobalMem BW | BWPeak | Batch GPU  | Batch  |
+|-------------|------------|---------|------------|-------|------------|-------|---------|--------------|--------|------------|--------|
+|    16777216 | 64.000 MiB |  46734x | 280.471 us | 0.99% | 275.773 us | 1.00% | 60.837G | 486.697 GB/s | 66.47% | 275.412 us | 46735x |
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index b254d61..4dd12b5 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -172,6 +172,11 @@ void markdown_printer::do_print_benchmark_list(
                    bench_ptr->get_name(),
                    num_configs);
 
+    if (axes.empty())
+    {
+      continue;
+    }
+
     fmt::format_to(buffer, "### Axes\n\n");
     for (const auto &axis_ptr : axes)
     {
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 4eee401..2ede52a 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -353,7 +353,7 @@ void option_parser::parse_impl()
 
   if (m_exit_after_parsing)
   {
-    std::exit(0);
+    this->cleanup_and_exit(0);
   }
 
   if (m_benchmarks.empty())
@@ -406,22 +406,22 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->print_version();
       fmt::print("\n");
       this->print_help();
-      std::exit(0);
+      this->cleanup_and_exit(0);
     }
     else if (arg == "--help-axes" || arg == "--help-axis")
     {
       this->print_help_axis();
-      std::exit(0);
+      this->cleanup_and_exit(0);
     }
     else if (arg == "--version")
     {
       this->print_version();
-      std::exit(0);
+      this->cleanup_and_exit(0);
     }
     else if (arg == "--list" || arg == "-l")
     {
       this->print_list();
-      std::exit(0);
+      this->cleanup_and_exit(0);
     }
     else if (arg == "--persistence-mode" || arg == "--pm")
     {
@@ -590,11 +590,28 @@ void option_parser::print_version() const
 
 void option_parser::print_list() const
 {
-  const auto &bench_mgr = nvbench::benchmark_manager::get();
+  auto do_print = [](auto &&printer) {
+    printer.print_device_info();
 
-  nvbench::markdown_printer printer{std::cout};
-  printer.print_device_info();
-  printer.print_benchmark_list(bench_mgr.get_benchmarks());
+    const auto &bench_mgr = nvbench::benchmark_manager::get();
+    printer.print_benchmark_list(bench_mgr.get_benchmarks());
+  };
+
+  // Reuse the first markdown printer already registered in m_printer, if any:
+  for (const auto &printer : m_printer.get_printers())
+  {
+    if (const auto *md_printer_const =
+          dynamic_cast<const markdown_printer *>(printer.get());
+        md_printer_const)
+    {
+      auto &md_printer = const_cast<markdown_printer &>(*md_printer_const);
+      do_print(md_printer);
+      return;
+    }
+  }
+
+  // No markdown printer was found; fall back to a new one that writes to stdout.
+  do_print(nvbench::markdown_printer{std::cout});
 }
 
 void option_parser::print_help() const
@@ -1012,4 +1029,11 @@ void option_parser::update_used_device_state() const
 
 nvbench::printer_base &option_parser::get_printer() { return m_printer; }
 
+void option_parser::cleanup_and_exit(int exit_code)
+{
+  // std::exit() does not unwind the stack; free all ofstreams now so they flush:
+  m_ofstream_storage.clear();
+  std::exit(exit_code);
+}
+
 } // namespace nvbench
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index 19d2984..8b6074b 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -121,6 +121,9 @@ private:
 
   void update_used_device_state() const;
 
+  // Releases any important resources and calls `std::exit(exit_code)`
+  [[noreturn]] void cleanup_and_exit(int exit_code);
+
   // less gross argv:
   std::vector<std::string> m_args;
 
diff --git a/nvbench/printer_multiplex.cuh b/nvbench/printer_multiplex.cuh
index d34ceb2..f607879 100644
--- a/nvbench/printer_multiplex.cuh
+++ b/nvbench/printer_multiplex.cuh
@@ -46,6 +46,9 @@ struct printer_multiplex : nvbench::printer_base
     return m_printers.size();
   }
 
+  [[nodiscard]] const auto &get_printers() const { return m_printers; }
+  [[nodiscard]] auto &get_printers() { return m_printers; }
+
 private:
   void do_print_device_info() override;
   void do_print_log_preamble() override;