diff --git a/.clang-format b/.clang-format
index 2a3dc26d..e593f29e 100644
--- a/.clang-format
+++ b/.clang-format
@@ -36,9 +36,21 @@ BreakBeforeBinaryOperators: None
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializers: BeforeComma
 BreakInheritanceList: BeforeComma
-ColumnLimit: 80
+ColumnLimit: 100
 CompactNamespaces: false
 ContinuationIndentWidth: 2
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<cub'
+    Priority:        1
+  - Regex:           '^<thrust'
+    Priority:        2
+  - Regex:           '^<cuda'
+    Priority:        3
+  - Regex:           '^<nvbench'
+    Priority:        4
+  - Regex:           '^<[a-z]*>$'
+    Priority:        5
 IndentCaseLabels: true
 IndentPPDirectives: None
 IndentWidth: 2
@@ -55,7 +67,7 @@ PenaltyExcessCharacter: 100
 PenaltyReturnTypeOnItsOwnLine: 90
 PointerAlignment: Right
 ReflowComments: true
-SortIncludes: true
+SortIncludes: CaseInsensitive
 SpaceAfterCStyleCast: false
 SpaceAfterLogicalNot: false
 SpaceAfterTemplateKeyword: true
diff --git a/.clangd b/.clangd
new file mode 100644
index 00000000..0e4c84bd
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,62 @@
+# https://clangd.llvm.org/config
+
+# Apply a config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+
+---
+
+# Apply a config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+
+---
+
+# Apply a config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Add:
+    # Allow variadic CUDA functions
+    - "-Xclang=-fcuda-allow-variadic-functions"
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Compiler: clang++
+  CompilationDatabase: .
+  Add:
+    - -x
+    - cuda
+    # report all errors
+    - "-ferror-limit=0"
+    - "-ftemplate-backtrace-limit=0"
+    - "-std=c++17"
+  Remove:
+    # strip CUDA fatbin args
+    - "-Xfatbin*"
+    - "-Xcompiler*"
+    - "-Xcudafe*"
+    - "-rdc=*"
+    - "-gpu=*"
+    - "--diag_suppress*"
+    # strip CUDA arch flags
+    - "-gencode*"
+    - "--generate-code*"
+    # strip gcc's -fcoroutines
+    - -fcoroutines
+    # strip CUDA flags unknown to clang
+    - "-ccbin*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
+    # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error.
+    # Temporarily suppressing it, but should probably fix
+    - "template_param_shadow"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 00000000..e84b5f39
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,198 @@
+> **Note**
+> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon!
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
+
+# CCCL Dev Containers
+
+CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#vscode) and [Docker](#docker). The guide also provides additional instructions in case you want to use [WSL](#wsl).
+
+## Table of Contents
+1. [Quickstart: VSCode (Recommended)](#vscode)
+2. [Quickstart: Docker (Manual Approach)](#docker)
+3. [Quickstart: Using WSL](#wsl)
+
+## Quickstart: VSCode (Recommended) <a name="vscode"></a>
+
+### Prerequisites
+- [Visual Studio Code](https://code.visualstudio.com/)
+- [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+- [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension
+
+### Steps
+
+1. Clone the Repository
+    ```bash
+    git clone https://github.com/nvidia/cccl.git
+    ```
+2. Open the cloned directory in VSCode
+
+3. Launch a Dev Container by clicking the prompt suggesting to "Reopen in Container"
+
+   ![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png)
+
+   - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
+
+     ![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
+
+4. Select an environment with the desired CTK and host compiler from the list:
+
+   ![Shows list of available container environments.](./img/container_list.png)
+
+5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time.
+
+6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent.
+
+7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
+
+### (Optional) Authenticate with GitHub for `sccache`
+
+After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations.
+
+Without authentication to the remote server, `sccache` will still accelerate local builds by using a filesystem cache.
+
+Follow the instructions in the prompt, as shown below, and enter the one-time code at https://github.com/login/device.
+
+  ![Shows authentication with GitHub to access sccache bucket.](./img/github_auth.png)
+
+To manually trigger this authentication, execute the `devcontainer-utils-vault-s3-init` script within the container.
+
+For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache).
+
+## Quickstart: Docker (Manual Approach) <a name="docker"></a>
+
+### Prerequisites
+- [Docker](https://docs.docker.com/desktop/install/linux-install/)
+
+### Steps
+1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
+    ```bash
+    git clone https://github.com/nvidia/cccl.git
+    cd cccl
+    ./.devcontainer/launch.sh --docker
+    ```
+    This script starts an interactive shell as the `coder` user inside the container with the local `cccl/` directory mirrored into `/home/coder/cccl`.
+
+    For specific environments, use the `--cuda` and `--host` options:
+    ```bash
+    ./.devcontainer/launch.sh --docker --cuda 12.2 --host gcc10
+    ```
+    See `./.devcontainer/launch.sh --help` for more information.
+
+2. Done. See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
+
+## Available Environments
+
+CCCL provides environments for both the oldest and newest supported CUDA versions with all compatible host compilers.
+
+Look in the [`.devcontainer/`](.) directory to see the available configurations. The top-level [`devcontainer.json`](./devcontainer.json) serves as the default environment. All `devcontainer.json` files in the `cuda<CTK_VERSION>-<HOST-COMPILER>` sub-directories are variations on this top-level file, with different base images for the different CUDA and host compiler versions.
+
+## VSCode Customization
+
+CCCL's Dev Containers come with certain VSCode settings and extensions configured by default, as can be seen in the [`devcontainer.json`](./devcontainer.json) file. This can be further customized by users without needing to modify the `devcontainer.json` file directly.
+
+For extensions, the [`dev.containers.defaultExtensions` setting](https://code.visualstudio.com/docs/devcontainers/containers#_always-installed-extensions) allows listing extensions that will always be installed.
+
+For more general customizations, VSCode allows using a dotfile repository. See the [VSCode documentation](https://code.visualstudio.com/docs/devcontainers/containers#_personalizing-with-dotfile-repositories) for more information.
+
+## GitHub Codespaces
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
+
+One of the benefits of Dev Containers is that they integrate natively with [GitHub Codespaces](https://github.com/features/codespaces). Codespaces provide a VSCode development environment right in your browser running on a machine in the cloud. This provides a truly one-click, turnkey development environment where you can develop, build, and test with no other setup required.
+
+Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) to get started with CCCL's Dev Containers on Codespaces. This will start the default Dev Container environment. [Click here](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=296416761&skip_quickstart=true) to start a Codespace with a particular environment and hardware configuration as shown:
+
+   ![Shows configuring a Codespace with a custom environment](../docs/images/codespaces.png)
+
+## For Maintainers: The `make_devcontainers.sh` Script
+
+### Overview
+
+[`make_devcontainers.sh`](./make_devcontainers.sh) generates devcontainer configurations for the unique combinations of CUDA Toolkit (CTK) versions and host compilers in [`ci/matrix.yaml`](../ci/matrix.yaml).
+
+### How It Works:
+
+1. Parse the matrix from `ci/matrix.yaml`.
+2. Use the top-level [`.devcontainer/devcontainer.json`](./devcontainer.json) as a template. For each unique combination of CTK version and host compiler, generate a corresponding `devcontainer.json` configuration, adjusting only the base Docker image to match the desired environment.
+3. Place the generated configurations in the `.devcontainer` directory, organizing them into subdirectories following the naming convention `cuda<CTK_VERSION>-<COMPILER_VERSION>`.
+
+For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
+
+**Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
+
+## Quickstart: Using WSL <a name="wsl"></a>
+
+> [!NOTE]
+> _Make sure you have the NVIDIA driver installed on your Windows host before moving further_. Run `nvidia-smi` to verify.
+
+### Install WSL on your Windows host
+
+> [!WARNING]
+> Disclaimer: This guide was developed for WSL 2 on Windows 11.
+
+1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
+
+2. Install WSL 2 by running:
+```bash
+wsl --install 
+```
+This typically installs the Ubuntu distribution by default.
+
+3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
+
+<h3 id="prereqs"> Install prerequisites and VS Code extensions</h3>
+
+4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell.
+
+5. Install the [WSL extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) on VS Code.
+
+    - `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension).
+
+    - Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case).
+
+6. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) on VS Code.
+
+    - On a fresh system you should be prompted to install `Docker` at this point; accept the prompt. If the installation hangs, you might have to restart VS Code afterwards.
+
+7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`).
+
+8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following:
+
+```json
+{
+    "runtimes": {
+        "nvidia": {
+            "path": "nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    }
+}
+```
+
+then run `sudo systemctl restart docker.service`.
+
+---
+### Build CCCL in WSL using Dev Containers
+
+9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git`
+
+
+10. Open the cloned CCCL repo in VS Code (press `Ctrl + Shift + P`, select `File: Open Folder...`, and choose the path where your CCCL clone is located).
+
+11. If prompted, choose `Reopen in Container`.
+    
+    - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
+
+12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
+
+From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
+
+13. If WSL was launched without the X-server enabled and you answer **Yes** when asked to "authenticate Git with your Github credentials", the browser might not open automatically, and the following error message is shown:
+
+> Failed opening a web browser at https://github.com/login/device
+  exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
+  Please try entering the URL in your browser manually
+
+In that case, open https://github.com/login/device manually in your web browser and enter the one-time code.
diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json
new file mode 100644
index 00000000..9cffedae
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc7-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc7",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc7"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc7"
+}
diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json
new file mode 100644
index 00000000..de336499
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc8-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc8",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc8"
+}
diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json
new file mode 100644
index 00000000..559bb50a
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc9-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc9",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc9"
+}
diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json
new file mode 100644
index 00000000..602753c6
--- /dev/null
+++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm9-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-llvm9",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda11.1-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-llvm9"
+}
diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json
new file mode 100644
index 00000000..5e480245
--- /dev/null
+++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc11-cuda11.8-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.8-gcc11",
+    "CCCL_CUDA_VERSION": "11.8",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda11.8-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.8-gcc11"
+}
diff --git a/.devcontainer/cuda12.0-gcc10/devcontainer.json b/.devcontainer/cuda12.0-gcc10/devcontainer.json
new file mode 100644
index 00000000..68d5f8ca
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc10-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc10",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc10"
+}
diff --git a/.devcontainer/cuda12.0-gcc11/devcontainer.json b/.devcontainer/cuda12.0-gcc11/devcontainer.json
new file mode 100644
index 00000000..f811a4a6
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc11-cuda12.0-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc11",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc11"
+}
diff --git a/.devcontainer/cuda12.0-gcc12/devcontainer.json b/.devcontainer/cuda12.0-gcc12/devcontainer.json
new file mode 100644
index 00000000..6f702f41
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.0-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc12",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc12"
+}
diff --git a/.devcontainer/cuda12.0-gcc7/devcontainer.json b/.devcontainer/cuda12.0-gcc7/devcontainer.json
new file mode 100644
index 00000000..ca9ab6ce
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc7/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc7-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc7",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc7"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc7"
+}
diff --git a/.devcontainer/cuda12.0-gcc8/devcontainer.json b/.devcontainer/cuda12.0-gcc8/devcontainer.json
new file mode 100644
index 00000000..387b53db
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc8-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc8",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc8"
+}
diff --git a/.devcontainer/cuda12.0-gcc9/devcontainer.json b/.devcontainer/cuda12.0-gcc9/devcontainer.json
new file mode 100644
index 00000000..d2e01ba1
--- /dev/null
+++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc9-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc9",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc9"
+}
diff --git a/.devcontainer/cuda12.0-llvm10/devcontainer.json b/.devcontainer/cuda12.0-llvm10/devcontainer.json
new file mode 100644
index 00000000..c227e9a5
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm10-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm10",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm10"
+}
diff --git a/.devcontainer/cuda12.0-llvm11/devcontainer.json b/.devcontainer/cuda12.0-llvm11/devcontainer.json
new file mode 100644
index 00000000..a61ae4b5
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm11-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm11",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm11"
+}
diff --git a/.devcontainer/cuda12.0-llvm12/devcontainer.json b/.devcontainer/cuda12.0-llvm12/devcontainer.json
new file mode 100644
index 00000000..c63e4050
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm12-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm12",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm12"
+}
diff --git a/.devcontainer/cuda12.0-llvm13/devcontainer.json b/.devcontainer/cuda12.0-llvm13/devcontainer.json
new file mode 100644
index 00000000..5cd6163c
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm13-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm13",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "13",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm13"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm13"
+}
diff --git a/.devcontainer/cuda12.0-llvm14/devcontainer.json b/.devcontainer/cuda12.0-llvm14/devcontainer.json
new file mode 100644
index 00000000..0fcae844
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm14-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm14",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm14"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm14"
+}
diff --git a/.devcontainer/cuda12.0-llvm9/devcontainer.json b/.devcontainer/cuda12.0-llvm9/devcontainer.json
new file mode 100644
index 00000000..6b9530e5
--- /dev/null
+++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm9-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm9",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm9"
+}
diff --git a/.devcontainer/cuda12.6-gcc10/devcontainer.json b/.devcontainer/cuda12.6-gcc10/devcontainer.json
new file mode 100644
index 00000000..2da4470b
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc10-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc10",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc10"
+}
diff --git a/.devcontainer/cuda12.6-gcc11/devcontainer.json b/.devcontainer/cuda12.6-gcc11/devcontainer.json
new file mode 100644
index 00000000..2930279f
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc11-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc11",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc11"
+}
diff --git a/.devcontainer/cuda12.6-gcc12/devcontainer.json b/.devcontainer/cuda12.6-gcc12/devcontainer.json
new file mode 100644
index 00000000..c4774db4
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc12",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc12"
+}
diff --git a/.devcontainer/cuda12.6-gcc7/devcontainer.json b/.devcontainer/cuda12.6-gcc7/devcontainer.json
new file mode 100644
index 00000000..1e731419
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc7/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc7-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc7",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc7"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc7"
+}
diff --git a/.devcontainer/cuda12.6-gcc8/devcontainer.json b/.devcontainer/cuda12.6-gcc8/devcontainer.json
new file mode 100644
index 00000000..92922c23
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc8-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc8",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc8"
+}
diff --git a/.devcontainer/cuda12.6-gcc9/devcontainer.json b/.devcontainer/cuda12.6-gcc9/devcontainer.json
new file mode 100644
index 00000000..f3f52237
--- /dev/null
+++ b/.devcontainer/cuda12.6-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc9-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc9",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc9"
+}
diff --git a/.devcontainer/cuda12.6-llvm10/devcontainer.json b/.devcontainer/cuda12.6-llvm10/devcontainer.json
new file mode 100644
index 00000000..01e2d4ab
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm10-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm10",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm10"
+}
diff --git a/.devcontainer/cuda12.6-llvm11/devcontainer.json b/.devcontainer/cuda12.6-llvm11/devcontainer.json
new file mode 100644
index 00000000..3ea9167a
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm11-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm11",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm11"
+}
diff --git a/.devcontainer/cuda12.6-llvm12/devcontainer.json b/.devcontainer/cuda12.6-llvm12/devcontainer.json
new file mode 100644
index 00000000..6a8fd246
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm12-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm12",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm12"
+}
diff --git a/.devcontainer/cuda12.6-llvm13/devcontainer.json b/.devcontainer/cuda12.6-llvm13/devcontainer.json
new file mode 100644
index 00000000..722b8a1b
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm13/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm13-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm13",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "13",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm13"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm13"
+}
diff --git a/.devcontainer/cuda12.6-llvm14/devcontainer.json b/.devcontainer/cuda12.6-llvm14/devcontainer.json
new file mode 100644
index 00000000..ca7ec344
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm14/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm14-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm14",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm14"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm14"
+}
diff --git a/.devcontainer/cuda12.6-llvm15/devcontainer.json b/.devcontainer/cuda12.6-llvm15/devcontainer.json
new file mode 100644
index 00000000..889c71a5
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm15/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm15-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm15",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "15",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm15"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm15"
+}
diff --git a/.devcontainer/cuda12.6-llvm16/devcontainer.json b/.devcontainer/cuda12.6-llvm16/devcontainer.json
new file mode 100644
index 00000000..e93737d3
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm16/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm16-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm16",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "16",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm16"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm16"
+}
diff --git a/.devcontainer/cuda12.6-llvm17/devcontainer.json b/.devcontainer/cuda12.6-llvm17/devcontainer.json
new file mode 100644
index 00000000..1f5e05dc
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm17/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm17-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm17",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "17",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm17"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm17"
+}
diff --git a/.devcontainer/cuda12.6-llvm18/devcontainer.json b/.devcontainer/cuda12.6-llvm18/devcontainer.json
new file mode 100644
index 00000000..6cbe548a
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm18/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm18-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm18",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "18",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm18"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm18"
+}
diff --git a/.devcontainer/cuda12.6-llvm9/devcontainer.json b/.devcontainer/cuda12.6-llvm9/devcontainer.json
new file mode 100644
index 00000000..9f97f1cd
--- /dev/null
+++ b/.devcontainer/cuda12.6-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-llvm9-cuda12.6-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-llvm9",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.6-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-llvm9"
+}
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000..c4774db4
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.12-cpp-gcc12-cuda12.6-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.6-gcc12",
+    "CCCL_CUDA_VERSION": "12.6",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.6-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.6-gcc12"
+}
diff --git a/.devcontainer/docker-entrypoint.sh b/.devcontainer/docker-entrypoint.sh
new file mode 100755
index 00000000..0fd94876
--- /dev/null
+++ b/.devcontainer/docker-entrypoint.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+# Maybe change the UID/GID of the container's non-root user ($REMOTE_USER) to $NEW_UID/$NEW_GID, then exec nvbench-entrypoint.sh
+
+: "${REMOTE_USER:="coder"}";  # falls back to "coder" when unset or empty
+: "${OLD_UID:=}";
+: "${OLD_GID:=}";
+: "${NEW_UID:=}";
+: "${NEW_GID:=}";
+# Read REMOTE_USER's UID, GID and home directory from /etc/passwd, then look up any user/group that already owns NEW_UID/NEW_GID
+eval "$(sed -n "s/${REMOTE_USER}:[^:]*:\([^:]*\):\([^:]*\):[^:]*:\([^:]*\).*/OLD_UID=\1;OLD_GID=\2;HOME_FOLDER=\3/p" /etc/passwd)";
+eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_UID}:.*/EXISTING_USER=\1/p" /etc/passwd)";
+eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_GID}:.*/EXISTING_GROUP=\1/p" /etc/group)";
+
+if [ -z "$OLD_UID" ]; then  # REMOTE_USER has no /etc/passwd entry: run the entrypoint without switching user
+    echo "Remote user not found in /etc/passwd ($REMOTE_USER).";
+    exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
+elif [ "$OLD_UID" = "$NEW_UID" ] && [ "$OLD_GID" = "$NEW_GID" ]; then  # nothing to remap
+    echo "UIDs and GIDs are the same ($NEW_UID:$NEW_GID).";
+    exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
+elif [ "$OLD_UID" != "$NEW_UID" ] && [ -n "$EXISTING_USER" ]; then  # NEW_UID is already taken by another user: leave the IDs untouched
+    echo "User with UID exists ($EXISTING_USER=$NEW_UID).";
+    exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
+else
+    if [ "$OLD_GID" != "$NEW_GID" ] && [ -n "$EXISTING_GROUP" ]; then  # NEW_GID is already taken: keep the old GID and only change the UID
+        echo "Group with GID exists ($EXISTING_GROUP=$NEW_GID).";
+        NEW_GID="$OLD_GID";
+    fi
+    echo "Updating UID:GID from $OLD_UID:$OLD_GID to $NEW_UID:$NEW_GID.";
+    sed -i -e "s/\(${REMOTE_USER}:[^:]*:\)[^:]*:[^:]*/\1${NEW_UID}:${NEW_GID}/" /etc/passwd;  # rewrite the UID:GID fields of REMOTE_USER's passwd entry
+    if [ "$OLD_GID" != "$NEW_GID" ]; then
+        sed -i -e "s/\([^:]*:[^:]*:\)${OLD_GID}:/\1${NEW_GID}:/" /etc/group;  # renumber the group that had OLD_GID
+    fi
+
+    # Fast parallel `chown -R`: only files under the home folder not already owned by $REMOTE_USER are touched
+    find "$HOME_FOLDER/" -not -user "$REMOTE_USER" -print0 \
+  | xargs -0 -r -n1 -P"$(nproc --all)" chown "$NEW_UID:$NEW_GID"
+
+    # Run the container command as $REMOTE_USER, preserving the container startup environment.
+    #
+    # We cannot use `su -w` because that's not supported by the `su` in Ubuntu18.04, so we reset the following
+    # environment variables to the expected values, then pass through everything else from the startup environment.
+    export HOME="$HOME_FOLDER";
+    export XDG_CACHE_HOME="$HOME_FOLDER/.cache";
+    export XDG_CONFIG_HOME="$HOME_FOLDER/.config";
+    export XDG_STATE_HOME="$HOME_FOLDER/.local/state";
+    export PYTHONHISTFILE="$HOME_FOLDER/.local/state/.python_history";
+    exec su -p "$REMOTE_USER" -- "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
+fi
diff --git a/.devcontainer/img/container_list.png b/.devcontainer/img/container_list.png
new file mode 100644
index 00000000..09c4510f
Binary files /dev/null and b/.devcontainer/img/container_list.png differ
diff --git a/.devcontainer/img/github_auth.png b/.devcontainer/img/github_auth.png
new file mode 100644
index 00000000..3f52b3a2
Binary files /dev/null and b/.devcontainer/img/github_auth.png differ
diff --git a/.devcontainer/img/open_in_container_manual.png b/.devcontainer/img/open_in_container_manual.png
new file mode 100644
index 00000000..e09435b8
Binary files /dev/null and b/.devcontainer/img/open_in_container_manual.png differ
diff --git a/.devcontainer/img/reopen_in_container.png b/.devcontainer/img/reopen_in_container.png
new file mode 100644
index 00000000..0e1d82dd
Binary files /dev/null and b/.devcontainer/img/reopen_in_container.png differ
diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh
new file mode 100755
index 00000000..a9ef143c
--- /dev/null
+++ b/.devcontainer/launch.sh
@@ -0,0 +1,307 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Ensure the script is being executed in the nvbench/ root
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..";
+
+print_help() {  # Write the launch.sh usage text to stdout, one printf argument per output line.
+    printf '%s\n' "Usage: $0 [-c|--cuda <CUDA version>] [-H|--host <Host compiler>] [-d|--docker]" \
+        "Launch a development container. If no CUDA version or Host compiler are specified," \
+        "the top-level devcontainer in .devcontainer/devcontainer.json will be used." \
+        "" \
+        "Options:" \
+        "  -c, --cuda               Specify the CUDA version. E.g., 12.2" \
+        "  -H, --host               Specify the host compiler. E.g., gcc12" \
+        "  -d, --docker             Launch the development environment in Docker directly without using VSCode." \
+        "  --gpus gpu-request       GPU devices to add to the container ('all' to pass all GPUs)." \
+        "  -e, --env list           Set additional container environment variables." \
+        "  -v, --volume list        Bind mount a volume." \
+        "  -h, --help               Display this help message and exit."
+}
+
+# Assign variable one scope above the caller
+# Usage: local "$1" && _upvar $1 "value(s)"
+# Param: $1  Variable name to assign value to
+# Param: $2.. Value(s) to assign.  If exactly one value is given, a single
+#            value is assigned, otherwise (zero or several) an array is assigned.
+# See: http://fvue.nl/wiki/Bash:_Passing_variables_by_reference
+_upvar() {
+    if unset -v "$1"; then  # drop the currently visible binding of the named variable before assigning
+        if (( $# == 2 )); then  # name plus exactly one value: scalar assignment
+            eval $1=\"\$2\";
+        else  # name plus zero or several values: array assignment
+            eval $1=\(\"\${@:2}\"\);
+        fi;
+    fi
+}
+
+parse_options() {
+    local -;
+    set -euo pipefail;
+    # Parses the launch.sh options into cuda_version, host_compiler, gpu_request, docker_mode, env_vars and volumes.
+    # The last argument is the name of the variable in which to return unparsed arguments
+    local UNPARSED="${!#}";
+    # Splice the unparsed arguments variable name from the arguments list
+    set -- "${@:1:$#-1}";
+
+    local OPTIONS=c:e:H:dhv:  # `v:` because -v, like --volume, consumes an argument
+    local LONG_OPTIONS=cuda:,env:,host:,gpus:,volume:,docker,help
+    # Declared separately: `local VAR=$(cmd)` would replace getopt's exit status with that of `local`.
+    local PARSED_OPTIONS
+
+    # Abort on unknown or malformed options (getopt has already printed the error).
+    if ! PARSED_OPTIONS=$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@"); then
+        exit 1
+    fi
+
+    eval set -- "${PARSED_OPTIONS}"
+
+    while true; do
+        case "$1" in
+            -c|--cuda)
+                cuda_version="$2"
+                shift 2
+                ;;
+            -e|--env)
+                env_vars+=("$1" "$2")
+                shift 2
+                ;;
+            -H|--host)
+                host_compiler="$2"
+                shift 2
+                ;;
+            --gpus)
+                gpu_request="$2"
+                shift 2
+                ;;
+            -d|--docker)
+                docker_mode=true
+                shift
+                ;;
+            -h|--help)
+                print_help
+                exit 0
+                ;;
+            -v|--volume)
+                volumes+=("$1" "$2")
+                shift 2
+                ;;
+            --)
+                shift
+                _upvar "${UNPARSED}" "${@}"
+                break
+                ;;
+            *)
+                echo "Invalid option: $1"
+                print_help
+                exit 1
+                ;;
+        esac
+    done
+}
+
+# shellcheck disable=SC2155
+launch_docker() {
+    local -;
+    set -euo pipefail
+    # Runs the image described by ${path}/devcontainer.json directly with `docker run`, passing "$@" as the container command.
+    inline_vars() {  # stdin -> stdout: replace devcontainer ${...} placeholders with concrete values
+        cat - \
+        `# inline local workspace folder` \
+      | sed "s@\${localWorkspaceFolder}@$(pwd)@g" \
+        `# inline local workspace folder basename` \
+      | sed "s@\${localWorkspaceFolderBasename}@$(basename "$(pwd)")@g" \
+        `# inline container workspace folder` \
+      | sed "s@\${containerWorkspaceFolder}@${WORKSPACE_FOLDER:-}@g" \
+        `# inline container workspace folder basename` \
+      | sed "s@\${containerWorkspaceFolderBasename}@$(basename "${WORKSPACE_FOLDER:-}")@g" \
+        `# translate local envvars to shell syntax` \
+      | sed -r 's/\$\{localEnv:([^\:]*):?(.*)\}/${\1:-\2}/g'
+    }
+
+    args_to_path() {  # wrap every argument in [ ] and concatenate them, e.g. '"a"' '"b"' -> ["a"]["b"]
+        local -a keys=("${@}")
+        keys=("${keys[@]/#/[}")
+        keys=("${keys[@]/%/]}")
+        echo "$(IFS=; echo "${keys[*]}")"
+    }
+
+    json_string() {  # print the value found at the given key path in the JSON on stdin (python errors are discarded)
+        python3 -c "import json,sys; print(json.load(sys.stdin)$(args_to_path "${@}"))" 2>/dev/null | inline_vars
+    }
+
+    json_array() {  # print each element of the JSON array at the key path, double-quoted, one per line
+        python3 -c "import json,sys; [print(f'\"{x}\"') for x in json.load(sys.stdin)$(args_to_path "${@}")]" 2>/dev/null | inline_vars
+    }
+
+    json_map() {  # print each entry of the JSON object at the key path as key="value", one per line
+        python3 -c "import json,sys; [print(f'{k}=\"{v}\"') for k,v in json.load(sys.stdin)$(args_to_path "${@}").items()]" 2>/dev/null | inline_vars
+    }
+
+    devcontainer_metadata_json() {  # print the "devcontainer.metadata" label of $DOCKER_IMAGE
+        docker inspect --type image --format '{{json .Config.Labels}}' "$DOCKER_IMAGE" \
+      | json_string '"devcontainer.metadata"'
+    }
+
+    ###
+    # Read relevant values from devcontainer.json
+    ###
+
+    local devcontainer_json="${path}/devcontainer.json";
+
+    # Read image
+    local DOCKER_IMAGE="$(json_string '"image"' < "${devcontainer_json}")"
+    # Always pull the latest copy of the image
+    docker pull "$DOCKER_IMAGE"
+
+    # Read workspaceFolder
+    local WORKSPACE_FOLDER="$(json_string '"workspaceFolder"' < "${devcontainer_json}")"
+    # Read remoteUser
+    local REMOTE_USER="$(json_string '"remoteUser"' < "${devcontainer_json}")"
+    # If remoteUser isn't in our devcontainer.json, read it from the image's "devcontainer.metadata" label
+    if test -z "${REMOTE_USER:-}"; then
+        REMOTE_USER="$(devcontainer_metadata_json | json_string "-1" '"remoteUser"')"
+    fi
+    # Read runArgs
+    local -a RUN_ARGS="($(json_array '"runArgs"' < "${devcontainer_json}"))"
+    # Read initializeCommand
+    local -a INITIALIZE_COMMAND="($(json_array '"initializeCommand"' < "${devcontainer_json}"))"
+    # Read containerEnv
+    local -a ENV_VARS="($(json_map '"containerEnv"' < "${devcontainer_json}" | sed -r 's/(.*)=(.*)/--env \1=\2/'))"
+    # Read mounts
+    local -a MOUNTS="($(
+        tee < "${devcontainer_json}"          \
+            1>/dev/null                       \
+            >(json_array '"mounts"')          \
+            >(json_string '"workspaceMount"') \
+      | xargs -r -I% echo --mount '%'
+    ))"
+
+    ###
+    # Update run arguments and container environment variables
+    ###
+
+    # Only pass `-it` if the shell is a tty
+    if ! ${CI:-'false'} && tty >/dev/null 2>&1 && (exec </dev/tty); then
+        RUN_ARGS+=("-it")
+    fi
+
+    for flag in rm init; do
+        if [[ " ${RUN_ARGS[*]} " != *" --${flag} "* ]]; then
+            RUN_ARGS+=("--${flag}")
+        fi
+    done
+
+    # Prefer the user-provided --gpus argument
+    if test -n "${gpu_request:-}"; then
+        RUN_ARGS+=(--gpus "${gpu_request}")
+    else
+        # Otherwise read and infer from hostRequirements.gpu
+        local GPU_REQUEST="$(json_string '"hostRequirements"' '"gpu"' < "${devcontainer_json}")"
+        if test "${GPU_REQUEST:-false}" = true; then
+            RUN_ARGS+=(--gpus all)
+        elif test "${GPU_REQUEST:-false}" = optional && \
+             command -v nvidia-container-runtime >/dev/null 2>&1; then
+            RUN_ARGS+=(--gpus all)
+        fi
+    fi
+
+        RUN_ARGS+=(--workdir "${WORKSPACE_FOLDER:-/home/coder/nvbench}")
+
+    if test -n "${REMOTE_USER:-}"; then
+        ENV_VARS+=(--env NEW_UID="$(id -u)")
+        ENV_VARS+=(--env NEW_GID="$(id -g)")
+        ENV_VARS+=(--env REMOTE_USER="$REMOTE_USER")
+        RUN_ARGS+=(-u root:root)
+        RUN_ARGS+=(--entrypoint "${WORKSPACE_FOLDER:-/home/coder/nvbench}/.devcontainer/docker-entrypoint.sh")
+    fi
+
+    if test -n "${SSH_AUTH_SOCK:-}"; then
+        ENV_VARS+=(--env "SSH_AUTH_SOCK=/tmp/ssh-auth-sock")
+        MOUNTS+=(--mount "source=${SSH_AUTH_SOCK},target=/tmp/ssh-auth-sock,type=bind")
+    fi
+
+    # Append user-provided volumes
+    if test -v volumes && test ${#volumes[@]} -gt 0; then
+        MOUNTS+=("${volumes[@]}")
+    fi
+
+    # Append user-provided envvars
+    if test -v env_vars && test ${#env_vars[@]} -gt 0; then
+        ENV_VARS+=("${env_vars[@]}")
+    fi
+
+    # Run the initialize command before starting the container
+    if test "${#INITIALIZE_COMMAND[@]}" -gt 0; then
+        eval "${INITIALIZE_COMMAND[*]@Q}"
+    fi
+
+    exec docker run \
+        "${RUN_ARGS[@]}" \
+        "${ENV_VARS[@]}" \
+        "${MOUNTS[@]}" \
+        "${DOCKER_IMAGE}" \
+        "$@"
+}
+
+launch_vscode() {
+    local -;
+    set -euo pipefail;
+    # Since Visual Studio Code allows only one instance per `devcontainer.json`,
+    # this code prepares a unique temporary directory structure for each launch of a devcontainer.
+    # By doing so, it ensures that multiple instances of the same environment can be run
+    # simultaneously. The script replicates the `devcontainer.json` from the desired CUDA
+    # and compiler environment into this temporary directory, adjusting paths to ensure the
+    # correct workspace is loaded. A special URL is then generated to instruct VSCode to
+    # launch the development container using this temporary configuration.
+    local workspace="$(basename "$(pwd)")"
+    local tmpdir="$(mktemp -d)/${workspace}"
+    mkdir -p "${tmpdir}"
+    mkdir -p "${tmpdir}/.devcontainer"
+    cp -arL "${path}/devcontainer.json" "${tmpdir}/.devcontainer"
+    sed -i "s@\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"  # `\$` keeps the placeholder literal; `\\$` let bash expand an unset variable
+    local path="${tmpdir}"
+    local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"
+    local url="vscode://vscode-remote/dev-container+${hash}/home/coder/${workspace}"  # matches workspaceFolder: /home/coder/<basename of the opened folder>
+
+    local launch=""
+    if type open >/dev/null 2>&1; then
+        launch="open"
+    elif type xdg-open >/dev/null 2>&1; then
+        launch="xdg-open"
+    fi
+
+    if [ -n "${launch}" ]; then
+        echo "Launching VSCode Dev Container URL: ${url}"
+        code --new-window "${tmpdir}"
+        exec "${launch}" "${url}" >/dev/null 2>&1
+    fi
+}
+
+main() {  # Entry point: parse the options, select a devcontainer.json, then launch it via Docker or VSCode.
+    local -a unparsed;
+    parse_options "$@" unparsed;
+    set -- "${unparsed[@]}";
+
+    # If no CTK/Host compiler are provided, just use the default environment
+    if [[ -z ${cuda_version:-} ]] && [[ -z ${host_compiler:-} ]]; then
+        path=".devcontainer"  # deliberately not local: launch_docker and launch_vscode read ${path}
+    else
+        path=".devcontainer/cuda${cuda_version:-}-${host_compiler:-}"  # `:-` so a missing -c or -H reaches the message below instead of a `set -u` abort
+        if [[ ! -f "${path}/devcontainer.json" ]]; then
+            echo "Unknown CUDA [${cuda_version:-}] compiler [${host_compiler:-}] combination"
+            echo "Requested devcontainer ${path}/devcontainer.json does not exist"
+            exit 1
+        fi
+    fi
+
+    if ${docker_mode:-'false'}; then
+        launch_docker "$@"
+    else
+        launch_vscode
+    fi
+}
+
+main "$@"
+
diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh
new file mode 100755
index 00000000..f868cc14
--- /dev/null
+++ b/.devcontainer/make_devcontainers.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of
+# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the
+# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version.
+# GitHub docs on using multiple devcontainer.json files:
+# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson
+
+set -euo pipefail
+
+# Ensure the script is being executed in its containing directory
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
+
+
+function usage {
+    echo "Usage: $0 [--clean] [-h/--help] [-v/--verbose]"
+    echo "  --clean   Remove stale devcontainer subdirectories"
+    echo "  -h, --help   Display this help message"
+    echo "  -v, --verbose  Enable verbose mode (set -x)"
+    exit 1
+}
+
+# Function to update the devcontainer.json file with the provided parameters
+update_devcontainer() {
+    # Render one devcontainer.json: read the template "$1", overwrite its image, name and
+    # containerEnv entries with the values passed in "$3".."$9", and write the result
+    # to "$2".
+    local input_file="$1" output_file="$2" name="$3"
+    local cuda_version="$4" compiler_name="$5" compiler_exe="$6" compiler_version="$7"
+    local os="$8" devcontainer_version="$9"
+
+    # Image tag layout: VERSION-cpp-COMPILER+COMPILER_VERSION-cudaCUDA_VERSION-OS
+    local image="rapidsai/devcontainers:${devcontainer_version}-cpp-"
+    image+="${compiler_name}${compiler_version}-cuda${cuda_version}-${os}"
+
+    # compiler_exe and os are handed to jq as variables, although the filter below does not
+    # reference them.
+    jq --arg image "$image" --arg name "$name" \
+       --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \
+       --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" --arg os "$os" \
+       '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name |
+        .containerEnv.CCCL_BUILD_INFIX = $name |
+        .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name |
+        .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\
+       "$input_file" > "$output_file"
+}
+
+make_name() {
+    # Compose the per-environment name "cudaCUDA_VERSION-COMPILER_NAME+COMPILER_VERSION"
+    # (for example cuda12.0-gcc12) and write it to stdout for command substitution.
+    local cuda="$1" compiler="$2" version="$3"
+
+    printf 'cuda%s-%s%s\n' "$cuda" "$compiler" "$version"
+}
+
+# Flag defaults; every command-line argument is then checked once, in order.
+CLEAN=false
+VERBOSE=false
+for arg in "$@"; do
+    case "$arg" in
+        --clean)
+            CLEAN=true
+            ;;
+        -v|--verbose)
+            VERBOSE=true
+            ;;
+        -h|--help)
+            usage
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+
+MATRIX_FILE="../ci/matrix.yaml"
+
+# Enable verbose mode if requested
+if [ "$VERBOSE" = true ]; then
+    set -x
+    cat "${MATRIX_FILE}"
+fi
+
+# Read matrix.yaml, convert it to json, and exclude Windows environments.
+# NOTE: values are assigned first and marked readonly at the end of this section, because
+# `readonly x=$(cmd)` discards cmd's exit status and would hide yq/jq failures from `set -e`.
+matrix_json=$(yq -o json "${MATRIX_FILE}" | jq 'del(.pull_request.nvcc[] | select(.os | contains("windows")))')
+
+# Get the devcontainer image version and define image tag root
+DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')
+
+# Get unique combinations of cuda version, compiler name/version, and Ubuntu version
+combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_exe: .compiler.exe, compiler_version: .compiler.version, os: .os}] | unique | .[]')
+
+# Update the base devcontainer with the default values
+# The root devcontainer.json file is used as the default container as well as a template for all
+# other devcontainer.json files by replacing the `image:` field with the appropriate image name
+base_devcontainer_file="./devcontainer.json"
+NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]')
+DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda')
+DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name')
+DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe')
+DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version')
+DEFAULT_OS=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.os')
+DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION")
+readonly matrix_json DEVCONTAINER_VERSION combinations base_devcontainer_file NEWEST_GCC_CUDA_ENTRY DEFAULT_CUDA DEFAULT_COMPILER_NAME DEFAULT_COMPILER_EXE DEFAULT_COMPILER_VERSION DEFAULT_OS DEFAULT_NAME
+
+update_devcontainer "${base_devcontainer_file}" "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEFAULT_OS" "$DEVCONTAINER_VERSION"
+mv "./temp_devcontainer.json" "${base_devcontainer_file}"
+
+# Create an array to keep track of valid subdirectory names
+valid_subdirs=()
+
+# The img folder should not be removed:
+valid_subdirs+=("img")
+
+# For each unique combination
+for combination in $combinations; do
+    cuda_version=$(echo "$combination" | jq -r '.cuda')
+    compiler_name=$(echo "$combination" | jq -r '.compiler_name')
+    compiler_exe=$(echo "$combination" | jq -r '.compiler_exe')
+    compiler_version=$(echo "$combination" | jq -r '.compiler_version')
+    os=$(echo "$combination" | jq -r '.os')
+
+    name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version")
+    mkdir -p "$name"
+    new_devcontainer_file="$name/devcontainer.json"
+
+    update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$os" "$DEVCONTAINER_VERSION"
+    echo "Created $new_devcontainer_file"
+
+    # Add the subdirectory name to the valid_subdirs array
+    valid_subdirs+=("$name")
+done
+
+# With --clean, delete every directory here whose name is not in valid_subdirs
+if [[ "$CLEAN" == true ]]; then
+    for entry in ./*; do
+        if [[ -d "$entry" && " ${valid_subdirs[*]} " != *" ${entry#./} "* ]]; then
+            echo "Removing stale subdirectory: $entry"
+            rm -r "$entry"
+        fi
+    done
+fi
diff --git a/.devcontainer/nvbench-entrypoint.sh b/.devcontainer/nvbench-entrypoint.sh
new file mode 100755
index 00000000..8cf81c16
--- /dev/null
+++ b/.devcontainer/nvbench-entrypoint.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# shellcheck disable=SC1091
+
+set -o errexit;
+# Run the devcontainer-utils hooks in order; errexit aborts on the first failing command.
+devcontainer-utils-post-create-command;
+devcontainer-utils-init-git;
+devcontainer-utils-post-attach-command;
+cd /home/coder/nvbench/
+
+# Replace this process with the given command, or with a login shell when none was given.
+if [ "$#" -eq 0 ]; then
+    exec /bin/bash -li;
+else
+    exec "$@";
+fi
diff --git a/.devcontainer/verify_devcontainer.sh b/.devcontainer/verify_devcontainer.sh
new file mode 100755
index 00000000..b5934ea2
--- /dev/null
+++ b/.devcontainer/verify_devcontainer.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+function usage {
+    echo "Usage: $0"
+    echo
+    echo "This script is intended to be run within one of CCCL's Dev Containers."
+    echo "It verifies that the expected environment variables and binary versions match what is expected."
+}
+
+check_envvars() {
+    # Print NAME=value for every variable named in "$@"; exit 1 at the first one that is
+    # unset or empty. ${!var_name} is indirect expansion: the value of the variable whose
+    # name is stored in var_name.
+    local var_name  # keep the loop variable out of the global scope
+    for var_name in "$@"; do
+        [[ -n "${!var_name:-}" ]] || { echo "::error:: ${var_name} variable is not set."; exit 1; }
+        echo "$var_name=${!var_name}"
+    done
+}
+
+check_host_compiler_version() {
+    # Verify that $CXX is the compiler/version named by CCCL_HOST_COMPILER(_VERSION); exit 1 otherwise.
+    local version_output actual_version expected_compiler  # assigned below so `local` cannot mask a failure
+    version_output=$($CXX --version)
+    if [[ "$CXX" == "g++" ]]; then
+        actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 4 | cut -d '.' -f 1)
+        expected_compiler="gcc"
+    elif [[ "$CXX" == "clang++" ]]; then
+        if [[ $version_output =~ clang\ version\ ([0-9]+) ]]; then
+            actual_version=${BASH_REMATCH[1]}
+        else
+            echo "::error:: Unable to determine clang version."
+            exit 1
+        fi
+        expected_compiler="llvm"
+    elif [[ "$CXX" == "icpc" ]]; then
+        actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 3 )
+        # The icpc compiler version of oneAPI release 2023.2.0 is 2021.10.0
+        if [[ "$actual_version" == "2021.10.0" ]]; then
+            actual_version="2023.2.0"
+        fi
+        expected_compiler="oneapi"
+    else
+        echo "::error:: Unexpected CXX value ($CXX)."
+        exit 1
+    fi
+
+    if [[ "$expected_compiler" != "${CCCL_HOST_COMPILER}" || "$actual_version" != "$CCCL_HOST_COMPILER_VERSION" ]]; then
+        echo "::error:: CXX ($CXX) version ($actual_version) does not match the expected compiler (${CCCL_HOST_COMPILER}) and version (${CCCL_HOST_COMPILER_VERSION})."
+        exit 1
+    fi
+    echo "Detected host compiler: $CXX version $actual_version"
+}
+
+check_cuda_version() {
+    # Compare the release reported by `nvcc --version` with CCCL_CUDA_VERSION; exit 1 on mismatch.
+    local cuda_version_output actual_cuda_version  # assigned below so `local` cannot mask a failure
+    cuda_version_output=$(nvcc --version)
+    if [[ $cuda_version_output =~ release\ ([0-9]+\.[0-9]+) ]]; then
+        actual_cuda_version=${BASH_REMATCH[1]}
+    else
+        echo "::error:: Unable to determine CUDA version from nvcc."
+        exit 1
+    fi
+    if [[ "$actual_cuda_version" != "$CCCL_CUDA_VERSION" ]]; then
+        echo "::error:: CUDA version ($actual_cuda_version) does not match the expected CUDA version ($CCCL_CUDA_VERSION)."
+        exit 1
+    fi
+    echo "Detected CUDA version: $actual_cuda_version"
+}
+
+main() {
+    if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+        usage
+        exit 0
+    fi
+
+    set -euo pipefail
+
+    check_envvars DEVCONTAINER_NAME CXX CUDAHOSTCXX CCCL_BUILD_INFIX CCCL_HOST_COMPILER CCCL_CUDA_VERSION CCCL_HOST_COMPILER_VERSION
+
+    check_host_compiler_version
+
+    check_cuda_version
+
+    echo "Dev Container successfully verified!"
+}
+
+main "$@"
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 00000000..11130409
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,17 @@
+# Exclude these commits from git-blame and similar tools.
+#
+# To use this file, run the following command from the repo root:
+#
+# ```
+# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
+# ```
+#
+# Include a brief comment with each commit added, for example:
+#
+# ```
+# 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
+# ```
+#
+# Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
+8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
+
diff --git a/.github/actions/compute-matrix/action.yml b/.github/actions/compute-matrix/action.yml
new file mode 100644
index 00000000..b8155e7a
--- /dev/null
+++ b/.github/actions/compute-matrix/action.yml
@@ -0,0 +1,25 @@
+
+name: Compute Matrix
+description: "Compute the matrix for a given matrix type from the specified matrix file"
+
+inputs:
+  matrix_query:
+    description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc"
+    required: true
+  matrix_file:
+    description: 'The file containing the matrix'
+    required: true
+outputs:
+  matrix:
+    description: 'The requested matrix'
+    value: ${{ steps.compute-matrix.outputs.MATRIX }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Compute matrix
+      id: compute-matrix
+      run: |
+        MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}}  ${{inputs.matrix_query}} )
+        echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT
+      shell: bash -euxo pipefail {0}
diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
new file mode 100755
index 00000000..cd3946f1
--- /dev/null
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+set -euo pipefail
+
+write_output() {
+  # Print "key=value" and append the same line to $GITHUB_OUTPUT (or /dev/null when it is unset).
+  local key="$1" value="$2"
+  printf '%s\n' "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
+}
+
+extract_matrix() {
+  # Emit outputs for matrix "$2" of file "$1"; locals are declared first so yq/jq failures reach `set -e`.
+  local file="$1" type="$2" matrix devcontainer_version nvcc_full_matrix per_cuda_compiler_matrix
+  matrix=$(yq -o=json "$file" | jq -cr ".$type")
+  devcontainer_version=$(yq -o json "$file" | jq -cr '.devcontainer_version')
+  write_output "DEVCONTAINER_VERSION" "$devcontainer_version"
+  nvcc_full_matrix="$(echo "$matrix" | jq -cr '.nvcc')"
+  per_cuda_compiler_matrix="$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
+  write_output "PER_CUDA_COMPILER_MATRIX"  "$per_cuda_compiler_matrix"
+  write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$per_cuda_compiler_matrix" | jq -r 'keys | @json')"
+}
+
+main() {
+  # "${1:-}" keeps `set -u` from aborting before the usage text when no arguments are given.
+  if [ "${1:-}" == "-v" ]; then
+    set -x
+    shift
+  fi
+  if [ $# -ne 2 ] || [ "$2" != "pull_request" ]; then
+    echo "Usage: $0 [-v] MATRIX_FILE MATRIX_TYPE"
+    echo "  -v            : Enable verbose output"
+    echo "  MATRIX_FILE   : The path to the matrix file."
+    echo "  MATRIX_TYPE   : The desired matrix. Supported values: 'pull_request'"
+    exit 1
+  fi
+
+  echo "Input matrix file:" >&2
+  cat "$1" >&2
+  echo "Matrix Type: $2" >&2
+
+  extract_matrix "$1" "$2"
+}
+
+main "$@"
diff --git a/.github/actions/configure_cccl_sccache/action.yml b/.github/actions/configure_cccl_sccache/action.yml
new file mode 100644
index 00000000..e0ea2707
--- /dev/null
+++ b/.github/actions/configure_cccl_sccache/action.yml
@@ -0,0 +1,13 @@
+name: Set up AWS credentials and environment variables for sccache
+description: "Set up AWS credentials and environment variables for sccache"
+runs:
+  using: "composite"
+  steps:
+    - name: Set environment variables
+      run: |
+        echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
+        echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
+        echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
+        echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
+        echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
+      shell: bash
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 00000000..895ba83e
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/problem-matchers/problem-matcher.json b/.github/problem-matchers/problem-matcher.json
new file mode 100644
index 00000000..f196a5c8
--- /dev/null
+++ b/.github/problem-matchers/problem-matcher.json
@@ -0,0 +1,14 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "nvcc",
+      "pattern": [
+        {
+          "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$",
+          "severity": 4,
+          "message": 5
+        }
+      ]
+    }
+  ]
+}
diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml
new file mode 100644
index 00000000..e2364d7f
--- /dev/null
+++ b/.github/workflows/build-and-test-linux.yml
@@ -0,0 +1,36 @@
+name: build and test
+
+defaults:
+  run:
+    shell: bash -exo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      cuda: {type: string, required: true}
+      host: {type: string, required: true}
+      cpu: {type: string, required: true}
+      test_name: {type: string, required: false}
+      build_script: {type: string, required: false}
+      test_script: {type: string, required: false}
+      container_image: {type: string, required: false}
+      run_tests: {type: boolean, required: false, default: true}
+
+permissions:
+  contents: read
+
+jobs:
+  build-and-test:
+    name: Build/Test ${{inputs.test_name}}
+    permissions:
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      cuda: ${{ inputs.cuda }}
+      host: ${{ inputs.host }}
+      name: Build/Test ${{inputs.test_name}}
+      runner: linux-${{inputs.cpu}}-gpu-v100-latest-1
+      image:  ${{ inputs.container_image }}
+      command: |
+        ${{ inputs.test_script }}
diff --git a/.github/workflows/build-and-test-windows.yml b/.github/workflows/build-and-test-windows.yml
new file mode 100644
index 00000000..2cabf9d0
--- /dev/null
+++ b/.github/workflows/build-and-test-windows.yml
@@ -0,0 +1,49 @@
+name: Build Windows
+
+on:
+  workflow_call:
+    inputs:
+      test_name: {type: string, required: false}
+      build_script: {type: string, required: false}
+      test_script: {type: string, required: false}
+      container_image: {type: string, required: false}
+
+jobs:
+  prepare:
+    name: Build Only ${{inputs.test_name}}
+    runs-on: windows-amd64-cpu16
+    permissions:
+      id-token: write
+      contents: read
+    env:
+      SCCACHE_BUCKET: rapids-sccache-devs
+      SCCACHE_REGION: us-east-2
+      SCCACHE_IDLE_TIMEOUT: 0
+      SCCACHE_S3_USE_SSL: true
+      SCCACHE_S3_NO_CREDENTIALS: false
+    steps:
+      - name: Get AWS credentials for sccache bucket
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
+          aws-region: us-east-2
+          role-duration-seconds: 43200 # 12 hours
+      - name: Fetch ${{ inputs.container_image }}
+        shell: powershell
+        run: docker pull ${{ inputs.container_image }}
+      - name: Run the tests
+        shell: powershell
+        run: >-
+            docker run ${{ inputs.container_image }} powershell -c "[System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('AWS_SECRET_ACCESS_KEY','${{env.AWS_SECRET_ACCESS_KEY}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('AWS_SESSION_TOKEN','${{env.AWS_SESSION_TOKEN }}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_BUCKET','${{env.SCCACHE_BUCKET}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_REGION','${{env.SCCACHE_REGION}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}')
+                                                                    git clone https://github.com/NVIDIA/nvbench.git;
+                                                                    cd nvbench;
+                                                                    git fetch --all;
+                                                                    git checkout ${{github.ref_name}};
+                                                                    ${{inputs.build_script}};"
diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml
new file mode 100644
index 00000000..22ac2b2c
--- /dev/null
+++ b/.github/workflows/dispatch-build-and-test.yml
@@ -0,0 +1,53 @@
+name: Dispatch build and test
+
+on:
+  workflow_call:
+    inputs:
+      project_name: {type: string, required: true}
+      per_cuda_compiler_matrix: {type: string, required: true}
+      devcontainer_version: {type: string, required: true}
+      is_windows: {type: boolean, required: true}
+
+permissions:
+  contents: read
+
+jobs:
+  # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
+  # ensures that the build/test steps can overlap across different configurations. For example,
+  # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
+  build_and_test_linux:
+    name: build and test linux
+    permissions:
+      id-token: write
+      contents: read
+    if: ${{ !inputs.is_windows }}
+    uses: ./.github/workflows/build-and-test-linux.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
+    with:
+      cuda: ${{ matrix.cuda }}
+      host: ${{matrix.compiler.name}}${{matrix.compiler.version}}
+      cpu: ${{ matrix.cpu }}
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}} ${{matrix.extra_build_args}}
+      build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
+      test_script:  "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
+      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+
+  build_and_test_windows:
+    name: build and test windows
+    permissions:
+      id-token: write
+      contents: read
+    if: ${{ inputs.is_windows }}
+    uses: ./.github/workflows/build-and-test-windows.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
+    with:
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}
+      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 ${{matrix.extra_build_args}}"
+      test_script:  "./ci/windows/test_${{ inputs.project_name }}.ps1 ${{matrix.extra_build_args}}"
+      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cuda${{matrix.cuda}}-${{matrix.compiler.name}}${{matrix.compiler.version}}-${{matrix.os}}
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 00000000..6d6708c3
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is the main workflow that runs on every PR and push to main
+name: pr
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+
+# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts.
+concurrency:
+  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  compute-matrix:
+    name: Compute matrix
+    runs-on: ubuntu-latest
+    outputs:
+      DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
+      PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
+      PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+      - name: Compute matrix outputs
+        id: set-outputs
+        run: |
+          .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
+
+  nvbench:
+    name: NVBench CUDA${{ matrix.cuda_host_combination }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: compute-matrix
+    uses: ./.github/workflows/dispatch-build-and-test.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+    with:
+      project_name: "nvbench"
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
+
+  verify-devcontainers:
+    name: Verify Dev Containers
+    permissions:
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/verify-devcontainers.yml
+
+  # This job is the final job that runs after all other jobs and is used for branch protection status checks.
+  # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
+  # https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
+  ci:
+    runs-on: ubuntu-latest
+    name: CI
+    if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
+    needs:
+      - nvbench
+      - verify-devcontainers
+    steps:
+      - name: Check status of all precursor jobs
+        if: >-
+          ${{
+               contains(needs.*.result, 'failure')
+            || contains(needs.*.result, 'cancelled')
+            || contains(needs.*.result, 'skipped')
+          }}
+        run: exit 1
diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml
new file mode 100644
index 00000000..c63f0256
--- /dev/null
+++ b/.github/workflows/run-as-coder.yml
@@ -0,0 +1,156 @@
+name: Run as coder user
+
+defaults:
+  run:
+    shell: bash -exo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      cuda: {type: string, required: true}
+      host: {type: string, required: true}
+      name: {type: string, required: true}
+      image: {type: string, required: true}
+      runner: {type: string, required: true}
+      command: {type: string, required: true}
+      env: { type: string, required: false, default: "" }
+
+permissions:
+  contents: read
+
+jobs:
+  run-as-coder:
+    name: ${{inputs.name}}
+    permissions:
+      id-token: write
+      contents: read
+    runs-on: ${{inputs.runner}}
+    container:
+      # This job now uses a docker-outside-of-docker (DOOD) strategy.
+      #
+      # The GitHub Actions runner application mounts the host's docker socket `/var/run/docker.sock` into the
+      # container. By using a container with the `docker` CLI, this container can launch docker containers
+      # using the host's docker daemon.
+      #
+      # This allows us to run actions that require node v20 in the `cruizba/ubuntu-dind:jammy-26.1.3` container, and
+      # then launch our Ubuntu18.04-based GCC 6/7 containers to build and test CCCL.
+      #
+      # The main inconvenience to this approach is that any container mounts have to match the paths of the runner host,
+      # not the paths as seen in the intermediate (`cruizba/ubuntu-dind`) container.
+      #
+      # Note: I am using `cruizba/ubuntu-dind:jammy-26.1.3` instead of `docker:latest`, because GitHub doesn't support
+      # JS actions in alpine aarch64 containers, instead failing actions with this error:
+      # ```
+      # Error: JavaScript Actions in Alpine containers are only supported on x64 Linux runners. Detected Linux Arm64
+      # ```
+      image: cruizba/ubuntu-dind:jammy-26.1.3
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v4
+        with:
+          path: nvbench
+          persist-credentials: false
+      - name: Add NVCC problem matcher
+        run: |
+          echo "::add-matcher::nvbench/.github/problem-matchers/problem-matcher.json"
+      - name: Configure credentials and environment variables for sccache
+        uses: ./nvbench/.github/actions/configure_cccl_sccache
+      - name: Run command
+        env:
+          CI: true
+          RUNNER: "${{inputs.runner}}"
+          COMMAND: "${{inputs.command}}"
+          AWS_ACCESS_KEY_ID: "${{env.AWS_ACCESS_KEY_ID}}"
+          AWS_SESSION_TOKEN: "${{env.AWS_SESSION_TOKEN}}"
+          AWS_SECRET_ACCESS_KEY: "${{env.AWS_SECRET_ACCESS_KEY}}"
+        run: |
+            echo "[host]      github.workspace: ${{github.workspace}}"
+            echo "[container] GITHUB_WORKSPACE: ${GITHUB_WORKSPACE:-}"
+            echo "[container]              PWD: $(pwd)"
+
+            # Necessary because we're doing docker-outside-of-docker:
+            # Make a symlink in the container that matches the host's ${{github.workspace}}, so that `$(pwd)`
+            # in `.devcontainer/launch.sh` constructs volume paths relative to the host's ${{github.workspace}}.
+            mkdir -p "$(dirname "${{github.workspace}}")"
+            ln -s "$(pwd)" "${{github.workspace}}"
+
+            cd "${{github.workspace}}"
+
+            cat <<"EOF" > ci.sh
+
+            #! /usr/bin/env bash
+            set -eo pipefail
+            echo -e "\e[1;34mRunning as '$(whoami)' user in $(pwd):\e[0m"
+            echo -e "\e[1;34m${{inputs.command}}\e[0m"
+            eval "${{inputs.command}}" || exit_code=$?
+            if [ ! -z "$exit_code" ]; then
+              echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
+              echo "::error:: To replicate this failure locally, follow the steps below:"
+              echo "1. Clone the repository, and navigate to the correct branch and commit:"
+              echo "   git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
+              echo ""
+              echo "2. Run the failed command inside the same Docker container used by the CI:"
+              echo "   docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
+              echo ""
+              echo "For additional information, see:"
+              echo "   - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
+              echo "   - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
+              exit $exit_code
+            fi
+            EOF
+
+            chmod +x ci.sh
+
+            mkdir "$RUNNER_TEMP/.aws";
+
+            cat <<EOF > "$RUNNER_TEMP/.aws/config"
+            [default]
+            bucket=rapids-sccache-devs
+            region=us-east-2
+            EOF
+
+            cat <<EOF > "$RUNNER_TEMP/.aws/credentials"
+            [default]
+            aws_access_key_id=$AWS_ACCESS_KEY_ID
+            aws_session_token=$AWS_SESSION_TOKEN
+            aws_secret_access_key=$AWS_SECRET_ACCESS_KEY
+            EOF
+
+            chmod 0600 "$RUNNER_TEMP/.aws/credentials"
+            chmod 0664 "$RUNNER_TEMP/.aws/config"
+
+            declare -a gpu_request=()
+
+            # Explicitly pass which GPU to use if on a GPU runner
+            if [[ "${RUNNER}" = *"-gpu-"* ]]; then
+              gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES}")
+            fi
+
+            host_path() {
+              sed "s@/__w@$(dirname "$(dirname "${{github.workspace}}")")@" <<< "$1"
+            }
+
+            # Launch this container using the host's docker daemon (script path = checkout `path: nvbench`)
+            nvbench/.devcontainer/launch.sh \
+              --docker \
+              --cuda ${{inputs.cuda}} \
+              --host ${{inputs.host}} \
+              "${gpu_request[@]}" \
+              --env "CI=$CI" \
+              --env "AWS_ROLE_ARN=" \
+              --env "COMMAND=$COMMAND" \
+              --env "GITHUB_ENV=$GITHUB_ENV" \
+              --env "GITHUB_SHA=$GITHUB_SHA" \
+              --env "GITHUB_PATH=$GITHUB_PATH" \
+              --env "GITHUB_OUTPUT=$GITHUB_OUTPUT" \
+              --env "GITHUB_ACTIONS=$GITHUB_ACTIONS" \
+              --env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \
+              --env "GITHUB_WORKSPACE=$GITHUB_WORKSPACE" \
+              --env "GITHUB_REPOSITORY=$GITHUB_REPOSITORY" \
+              --env "GITHUB_STEP_SUMMARY=$GITHUB_STEP_SUMMARY" \
+              --volume "${{github.workspace}}/ci.sh:/ci.sh" \
+              --volume "$(host_path "$RUNNER_TEMP")/.aws:/root/.aws" \
+              --volume "$(dirname "$(dirname "${{github.workspace}}")"):/__w" \
+              -- /ci.sh
diff --git a/.github/workflows/verify-devcontainers.yml b/.github/workflows/verify-devcontainers.yml
new file mode 100644
index 00000000..4bbfa6b3
--- /dev/null
+++ b/.github/workflows/verify-devcontainers.yml
@@ -0,0 +1,86 @@
+name: Verify devcontainers
+
+on:
+  workflow_call:
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+permissions:
+  contents: read
+
+jobs:
+  verify-make-devcontainers:
+    name: Verify devcontainer files are up-to-date
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+    - name: Setup jq and yq
+      run: |
+        sudo apt-get update
+        sudo apt-get install jq -y
+        sudo wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.34.2/yq_linux_amd64
+        sudo chmod +x /usr/local/bin/yq
+    - name: Run the script to generate devcontainer files
+      run: |
+        ./.devcontainer/make_devcontainers.sh --verbose
+    - name: Check for changes
+      run: |
+        if [[ $(git diff --stat) != '' || $(git status --porcelain | grep '^??') != '' ]]; then
+          git diff --minimal
+          git status --porcelain
+          echo "::error:: Dev Container files are out of date or there are untracked files. Run the .devcontainer/make_devcontainers.sh script and commit the changes."
+          exit 1
+        else
+          echo "::notice::Dev Container files are up-to-date."
+        fi
+
+  get-devcontainer-list:
+    needs: verify-make-devcontainers
+    name: Get list of devcontainer.json files
+    runs-on: ubuntu-latest
+    outputs:
+      devcontainers: ${{ steps.get-list.outputs.devcontainers }}
+    steps:
+    - name: Check out the code
+      uses: actions/checkout@v4
+    - name: Get list of devcontainer.json paths and names
+      id: get-list
+      run: |
+        devcontainers=$(find .devcontainer/ -name 'devcontainer.json' | while read -r devcontainer; do
+          jq --arg path "$devcontainer" '{path: $path, name: .name}' "$devcontainer"
+          done | jq -s -c .)
+        echo "devcontainers=${devcontainers}" | tee --append "${GITHUB_OUTPUT}"
+
+  verify-devcontainers:
+    needs: get-devcontainer-list
+    name: ${{matrix.devcontainer.name}}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        devcontainer: ${{fromJson(needs.get-devcontainer-list.outputs.devcontainers)}}
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - name: Check out the code
+      uses: actions/checkout@v4
+      # We don't really need sccache configured, but we need the AWS credentials envvars to be set
+      # in order to avoid the devcontainer hanging waiting for GitHub authentication
+    - name: Configure credentials and environment variables for sccache
+      uses: ./.github/actions/configure_cccl_sccache
+    - name: Run in devcontainer
+      uses: devcontainers/ci@v0.3
+      with:
+        push: never
+        configFile: ${{ matrix.devcontainer.path }}
+        env: |
+          SCCACHE_REGION=${{ env.SCCACHE_REGION }}
+          AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }}
+          AWS_SESSION_TOKEN=${{ env.AWS_SESSION_TOKEN }}
+          AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }}
+        runCmd: |
+          .devcontainer/verify_devcontainer.sh
diff --git a/.gitignore b/.gitignore
index 57309ab5..50fac98d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,10 @@
 build*/
+.aws
+.vscode
+.cache
+.config
 .idea
 cmake-build-*
 *~
+compile_commands.json
+CMakeUserPresets.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 369d6311..8eb5f883 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,5 @@
-# 3.20.1 required for rapids-cmake
-# 3.21.0 required for NVBench_ADD_DEPENDENT_DLLS_TO_* (MSVC only)
-cmake_minimum_required(VERSION 3.20.1)
+# 3.23.1 required for rapids-cmake
+cmake_minimum_required(VERSION 3.23.1)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
@@ -22,6 +21,11 @@ project(NVBench
 
 nvbench_init_rapids_cmake()
 
+# Define NVBench_DETECTED_${LANG}_STANDARDS
+include(cmake/DetectSupportedStandards.cmake)
+detect_supported_standards(NVBench CXX 17 20)
+detect_supported_standards(NVBench CUDA 17 20)
+
 # See NVIDIA/NVBench#52
 find_package(CUDAToolkit REQUIRED)
 set(cupti_default ON)
@@ -29,15 +33,22 @@ if (${CUDAToolkit_VERSION} VERSION_LESS 11.3)
   set(cupti_default OFF)
 endif()
 
+option(BUILD_SHARED_LIBS "Build NVBench as a shared library" ON)
+
 option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
 option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
 
 option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
+option(NVBench_ENABLE_HEADER_TESTING "Build NVBench header tests." OFF)
 option(NVBench_ENABLE_DEVICE_TESTING
   "Include tests that require a GPU (with locked clocks)."
   OFF
 )
 option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
+option(NVBench_ENABLE_INSTALL_RULES "Install NVBench." ${NVBench_TOPLEVEL_PROJECT})
+
+include(cmake/NVBenchUtilities.cmake) # Must be first
+include(cmake/NVBenchClangdCompileInfo.cmake) # Must be before any targets are created
 
 include(cmake/NVBenchConfigTarget.cmake)
 include(cmake/NVBenchDependentDlls.cmake)
@@ -45,13 +56,15 @@ include(cmake/NVBenchExports.cmake)
 include(cmake/NVBenchWriteConfigHeader.cmake)
 include(cmake/NVBenchDependencies.cmake)
 include(cmake/NVBenchInstallRules.cmake)
-include(cmake/NVBenchUtilities.cmake)
 
 message(STATUS "NVBench CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 add_subdirectory(nvbench)
 
-if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
+if (NVBench_ENABLE_EXAMPLES OR
+    NVBench_ENABLE_TESTING OR
+    NVBench_ENABLE_HEADER_TESTING)
+  include(CTest)
   enable_testing()
 endif()
 
@@ -65,4 +78,8 @@ if (NVBench_ENABLE_TESTING)
   add_subdirectory(testing)
 endif()
 
+if (NVBench_ENABLE_HEADER_TESTING)
+  include(cmake/NVBenchHeaderTesting.cmake)
+endif()
+
 nvbench_generate_exports()
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 00000000..3e66f9ad
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,74 @@
+{
+  "version": 3,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 23,
+    "patch": 1
+  },
+  "configurePresets": [
+    {
+      "name": "base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_CUDA_ARCHITECTURES": "all-major",
+        "NVBench_ENABLE_CUPTI": true,
+        "NVBench_ENABLE_DEVICE_TESTING": false,
+        "NVBench_ENABLE_EXAMPLES": true,
+        "NVBench_ENABLE_HEADER_TESTING": true,
+        "NVBench_ENABLE_INSTALL_RULES": true,
+        "NVBench_ENABLE_NVML": true,
+        "NVBench_ENABLE_TESTING": true,
+        "NVBench_ENABLE_WERROR": true
+      }
+    },
+    {
+      "name": "nvbench-dev",
+      "displayName": "Developer Build",
+      "inherits": "base",
+      "cacheVariables": {
+        "NVBench_ENABLE_DEVICE_TESTING": true
+      }
+    },
+    {
+      "name": "nvbench-ci",
+      "displayName": "NVBench CI",
+      "inherits": "base"
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "nvbench-dev",
+      "configurePreset": "nvbench-dev"
+    },
+    {
+      "name": "nvbench-ci",
+      "configurePreset": "nvbench-ci"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "base",
+      "hidden": true,
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "noTestsAction": "error",
+        "stopOnFailure": false
+      }
+    },
+    {
+      "name": "nvbench-dev",
+      "configurePreset": "nvbench-dev",
+      "inherits": "base"
+    },
+    {
+      "name": "nvbench-ci",
+      "configurePreset": "nvbench-ci",
+      "inherits": "base"
+    }
+  ]
+}
diff --git a/README.md b/README.md
index c1cad5ad..285213f1 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,15 @@ features:
     * Executes the benchmark multiple times back-to-back and records total time.
     * Reports the average execution time (total time / number of executions).
 
+# Supported Compilers and Tools
+
+- CMake >= 3.23.1
+- CUDA Toolkit + nvcc: 11.1 -> 12.4
+- g++: 7 -> 12
+- clang++: 9 -> 18
+- cl.exe: 2019 -> 2022 (19.29, 19.39)
+- Headers are tested with C++17 -> C++20.
+
 # Getting Started
 
 ## Minimal Benchmark
@@ -34,7 +43,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
 
 ```cpp
 void my_benchmark(nvbench::state& state) {
-  state.exec([](nvbench::launch& launch) { 
+  state.exec([](nvbench::launch& launch) {
     my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
   });
 }
@@ -72,7 +81,7 @@ mkdir -p build
 cd build
 cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURES=70 .. && make
 ```
-Be sure to set `CMAKE_CUDA_ARCHITECTURE` based on the GPU you are running on. 
+Be sure to set `CMAKE_CUDA_ARCHITECTURES` based on the GPU you are running on.
 
 Examples are built by default into `build/bin` and are prefixed with `nvbench.example`.
 
@@ -119,7 +128,7 @@ Pass: Batch: 0.261963ms GPU, 7.18s total GPU, 27394x
 ## Demo Project
 
 To get started using NVBench with your own kernels, consider trying out
-the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). 
+the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
 
 `nvbench_demo` provides a simple CMake project that uses NVBench to build an
 example benchmark. It's a great way to experiment with the library without a lot
@@ -129,7 +138,7 @@ of investment.
 
 Contributions are welcome!
 
-For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors. 
+For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
 
 ## Tests
 
@@ -146,7 +155,7 @@ To run all tests:
 ```
 make test
 ```
-or 
+or
 ```
 ctest
 ```
diff --git a/ci/axis/cpu.yml b/ci/axis/cpu.yml
deleted file mode 100644
index 7230b666..00000000
--- a/ci/axis/cpu.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 5
-  - 6
-  - 7
-  - 8
-  - 9
-  - 10
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 5
-  - CXX_TYPE: clang
-    CXX_VER: 6
-  - CXX_TYPE: gcc
-    CXX_VER: 12
diff --git a/ci/axis/gpu.yml b/ci/axis/gpu.yml
deleted file mode 100644
index 15310794..00000000
--- a/ci/axis/gpu.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 11
-  - CXX_TYPE: gcc
-    CXX_VER: 12
diff --git a/ci/build_common.sh b/ci/build_common.sh
new file mode 100755
index 00000000..2c30414a
--- /dev/null
+++ b/ci/build_common.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Ensure the script is being executed in its containing directory
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
+
+# Script defaults
+HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
+CXX_STANDARD=17
+CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc`
+CUDA_ARCHS= # Empty, use presets by default.
+GLOBAL_CMAKE_OPTIONS=()
+DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks.
+
+# Print usage information and exit with a non-zero status
+function usage {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
+    echo
+    echo "Options:"
+    echo "  -v/--verbose: enable shell echo for debugging"
+    echo "  -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
+    echo "  -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
+    echo "  -std: CUDA/C++ standard (Defaults to 17)"
+    echo "  -arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to value in presets file)"
+    echo "  -cmake-options: Additional options to pass to CMake"
+    echo
+    echo "Examples:"
+    echo "  $ PARALLEL_LEVEL=8 $0"
+    echo "  $ PARALLEL_LEVEL=8 $0 -cxx g++-9"
+    echo "  $ $0 -cxx clang++-8"
+    echo "  $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc"
+    echo "  $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\""
+    exit 1
+}
+
+# Parse options
+
+# Copy the args into a temporary array, since we will modify them and
+# the parent script may still need them.
+args=("$@")
+while [ "${#args[@]}" -ne 0 ]; do
+    case "${args[0]}" in
+    -v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
+    -cxx)  HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
+    -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
+    -disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");;
+    -cmake-options)
+        if [ -n "${args[1]}" ]; then
+            IFS=' ' read -ra split_args <<< "${args[1]}"
+            GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}")
+            args=("${args[@]:2}")
+        else
+            echo "Error: No arguments provided for -cmake-options"
+            usage
+            exit 1
+        fi
+        ;;
+    -h | -help | --help) usage ;;
+    *) echo "Unrecognized option: ${args[0]}"; usage ;;
+    esac
+done
+
+# Convert to full paths:
+HOST_COMPILER=$(which ${HOST_COMPILER})
+CUDA_COMPILER=$(which ${CUDA_COMPILER})
+
+if [[ -n "${CUDA_ARCHS}" ]]; then
+    GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}")
+fi
+
+if [ $VERBOSE ]; then
+    set -x
+fi
+
+# Begin processing unsets after option parsing
+set -u
+
+readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
+
+if [ -z ${CCCL_BUILD_INFIX+x} ]; then
+    CCCL_BUILD_INFIX=""
+fi
+
+# Presets will be configured in this directory:
+BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
+
+# The most recent build is always symlinked to ../build/latest (relative to this script's directory)
+mkdir -p $BUILD_DIR
+rm -f ../build/latest
+ln -sf $BUILD_DIR ../build/latest
+
+# Now that BUILD_DIR exists, use readlink to canonicalize the path:
+BUILD_DIR=$(readlink -f "${BUILD_DIR}")
+
+# Prepare environment for CMake:
+export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}"
+export CTEST_PARALLEL_LEVEL="1"
+export CXX="${HOST_COMPILER}"
+export CUDACXX="${CUDA_COMPILER}"
+export CUDAHOSTCXX="${HOST_COMPILER}"
+export CXX_STANDARD
+
+source ./pretty_printing.sh
+
+print_environment_details() {
+  begin_group "⚙️ Environment Details"
+
+  echo "pwd=$(pwd)"
+
+  print_var_values \
+      BUILD_DIR \
+      CXX_STANDARD \
+      CXX \
+      CUDACXX \
+      CUDAHOSTCXX \
+      NVCC_VERSION \
+      CMAKE_BUILD_PARALLEL_LEVEL \
+      CTEST_PARALLEL_LEVEL \
+      CCCL_BUILD_INFIX \
+      GLOBAL_CMAKE_OPTIONS
+
+  echo "Current commit is:"
+  git log -1 || echo "Not a repository"
+
+  if command -v nvidia-smi &> /dev/null; then
+    nvidia-smi
+  else
+    echo "nvidia-smi not found"
+  fi
+
+  end_group "⚙️ Environment Details"
+}
+
+fail_if_no_gpu() {
+    if ! nvidia-smi &> /dev/null; then
+        echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." >&2
+        exit 1
+    fi
+}
+
+function print_test_time_summary()
+{
+    ctest_log=${1}
+    # $1 is the path of a ctest log file; nothing is printed when that file does not exist.
+    if [ -f "${ctest_log}" ]; then
+        begin_group "⏱️ Longest Test Steps"
+        # Only print the full output in CI:
+        if [ -n "${GITHUB_ACTIONS:-}" ]; then
+            cmake -DLOGFILE="${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake
+        else
+            cmake -DLOGFILE="${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake | head -n 15
+        fi
+        end_group "⏱️ Longest Test Steps"
+    fi
+}
+
+function configure_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+    local GROUP_NAME="🛠️  CMake Configure ${BUILD_NAME}"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE "${GLOBAL_CMAKE_OPTIONS[@]}" $CMAKE_OPTIONS
+    status=$?
+    popd > /dev/null
+    return $status
+}
+
+function build_preset() {
+    # Build preset $2 from the parent directory, print sccache stats, and return the build status; $1 labels the log group.
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local green="1;32" red="1;31"
+    local GROUP_NAME="🏗️  Build ${BUILD_NAME}"
+
+    source "./sccache_stats.sh" "start"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v
+    status=$?
+    popd > /dev/null
+
+    minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
+
+    # Only print detailed stats in actions workflow
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        begin_group "💲 sccache stats"
+        echo "${minimal_sccache_stats}"
+        sccache -s
+        end_group
+
+        begin_group "🥷 ninja build times"
+        echo "The \"weighted\" time is the elapsed time of each build step divided by the number
+              of tasks that were running in parallel. This makes it an excellent approximation
+              of how \"important\" a slow step was. A link that is entirely or mostly serialized
+              will have a weighted time that is the same or similar to its elapsed time. A
+              compile that runs in parallel with 999 other compiles will have a weighted time
+              that is tiny."
+        ./ninja_summary.py -C "${BUILD_DIR}/${PRESET}" || echo "ninja_summary.py failed"
+        end_group
+    else
+      echo $minimal_sccache_stats
+    fi
+
+    return $status
+}
+
+function test_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local GROUP_NAME="🚀  Test ${BUILD_NAME}"
+
+    fail_if_no_gpu
+
+    # The ctest log is written under BUILD_DIR so the timing summary below can read it back.
+    ctest_log_dir="${BUILD_DIR}/log/ctest"
+    ctest_log="${ctest_log_dir}/${PRESET}"
+    mkdir -p "${ctest_log_dir}"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" --preset=$PRESET
+    status=$?
+    popd > /dev/null
+
+    print_test_time_summary "${ctest_log}"
+
+    return $status
+}
+
+function configure_and_build_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+
+    configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS"
+    build_preset "$BUILD_NAME" "$PRESET"
+}
diff --git a/ci/build_nvbench.sh b/ci/build_nvbench.sh
new file mode 100755
index 00000000..e9ba372e
--- /dev/null
+++ b/ci/build_nvbench.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+source "$(dirname "$0")/build_common.sh"
+
+print_environment_details
+
+PRESET="nvbench-ci"
+
+CMAKE_OPTIONS=""
+
+function version_lt() {
+  # Strip every "v" so inputs such as "v11.3" compare as plain version numbers.
+  local lhs="${1//v/}" rhs="${2//v/}"
+  # Equal versions are not "less than".
+  [ "$lhs" = "$rhs" ] && return 1
+  # lhs < rhs exactly when lhs is the first entry in version-sorted order.
+  [ "$lhs" = "$(printf '%s\n' "$lhs" "$rhs" | sort -V | head -n1)" ]
+}
+
+# If CUDA_COMPILER is nvcc and its version < 11.3, disable CUPTI (query the selected compiler, not whichever nvcc is on PATH)
+if [[ "$CUDA_COMPILER" == *"nvcc"* ]]; then
+  CUDA_VERSION=$("$CUDA_COMPILER" --version | grep release | sed -r 's/.*release ([0-9.]+).*/\1/')
+  if version_lt "$CUDA_VERSION" "11.3"; then
+    CMAKE_OPTIONS+=" -DNVBench_ENABLE_CUPTI=OFF "
+  fi
+fi
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+
+print_time_summary
diff --git a/ci/common/build.bash b/ci/common/build.bash
deleted file mode 100755
index 61b3654c..00000000
--- a/ci/common/build.bash
+++ /dev/null
@@ -1,231 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI
-################################################################################
-
-set -e
-
-# append variable value
-# Appends ${value} to ${variable}, adding a space before ${value} if
-# ${variable} is not empty.
-function append {
-  tmp="${!1:+${!1} }${2}"
-  eval "${1}=\${tmp}"
-}
-
-# log args...
-# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after.
-function log() {
-  printf "\n>>>> %s\n\n" "${*}"
-}
-
-# print_with_trailing_blank_line args...
-# Prints ${args[*]} with one blank line following, preserving newlines within
-# ${args[*]} but stripping any preceding ${args[*]}.
-function print_with_trailing_blank_line {
-  printf "%s\n\n" "${*}"
-}
-
-# echo_and_run name args...
-# Echo ${args[@]}, then execute ${args[@]}
-function echo_and_run {
-  echo "${1}: ${@:2}"
-  ${@:2}
-}
-
-# echo_and_run_timed name args...
-# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
-# including ${name} in the output of the time.
-function echo_and_run_timed {
-  echo "${@:2}"
-  TIMEFORMAT=$'\n'"${1} Time: %lR"
-  time ${@:2}
-}
-
-# join_delimit <delimiter> [value [value [...]]]
-# Combine all values into a single string, separating each by a single character
-# delimiter. Eg:
-# foo=(bar baz kramble)
-# joined_foo=$(join_delimit "|" "${foo[@]}")
-# echo joined_foo # "bar|baz|kramble"
-function join_delimit {
-  local IFS="${1}"
-  shift
-  echo "${*}"
-}
-
-################################################################################
-# VARIABLES - Set up bash and environmental variables.
-################################################################################
-
-# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
-source /etc/cccl.bashrc
-
-# Set path.
-export PATH=/usr/local/cuda/bin:${PATH}
-
-# Set home to the job's workspace.
-export HOME=${WORKSPACE}
-
-# Switch to the build directory.
-cd ${WORKSPACE}
-mkdir -p build
-cd build
-
-# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
-rm -f .ninja_log
-
-if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
-  CMAKE_BUILD_TYPE="Release"
-fi
-
-CMAKE_BUILD_FLAGS="--"
-
-# The Docker image sets up `${CXX}` and `${CUDACXX}`.
-append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
-append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
-
-if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
-  echo "nvc++ not supported."
-  exit 1
-else
-  if [[ "${CXX_TYPE}" == "icc" ]]; then
-    echo "icc not supported."
-    exit 1
-  fi
-  # We're using NVCC so we need to set the host compiler.
-  append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-DCMAKE_CUDA_HOST_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-G Ninja"
-  # Don't stop on build failures.
-  append CMAKE_BUILD_FLAGS "-k0"
-fi
-
-if [[ -n "${PARALLEL_LEVEL}" ]]; then
-  DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}"
-fi
-
-WSL=0
-if [[ $(grep -i microsoft /proc/version) ]]; then
-  echo "Windows Subsystem for Linux detected."
-  WSL=1
-fi
-export WSL
-
-#append CMAKE_FLAGS "-DCMAKE_CUDA_ARCHITECTURES=all"
-
-append CMAKE_FLAGS "-DNVBench_ENABLE_EXAMPLES=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_TESTING=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_CUPTI=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_WERROR=ON"
-
-# These consume a lot of time and don't currently have
-# any value as regression tests.
-append CMAKE_FLAGS "-DNVBench_ENABLE_DEVICE_TESTING=OFF"
-
-# NVML doesn't work under WSL
-if [[ ${WSL} -eq 0 ]]; then
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=ON"
-else
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=OFF"
-fi
-
-if [[ -n "${@}" ]]; then
-  append CMAKE_BUILD_FLAGS "${@}"
-fi
-
-append CTEST_FLAGS "--output-on-failure"
-
-# Export variables so they'll show up in the logs when we report the environment.
-export CMAKE_FLAGS
-export CMAKE_BUILD_FLAGS
-export CTEST_FLAGS
-
-################################################################################
-# ENVIRONMENT - Configure and print out information about the environment.
-################################################################################
-
-log "Determine system topology..."
-
-# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
-# system topology.
-source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
-
-log "Get environment..."
-
-env | sort
-
-log "Check versions..."
-
-# We use sed and echo below to ensure there is always one and only trailing
-# line following the output from each tool.
-
-${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-if [[ "${BUILD_TYPE}" == "gpu" ]]; then
-  nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
-fi
-
-################################################################################
-# BUILD
-################################################################################
-
-log "Configure..."
-
-echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
-configure_status=$?
-
-log "Build..."
-
-# ${PARALLEL_LEVEL} needs to be passed after we run
-# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
-set +e # Don't stop on build failures.
-echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
-build_status=$?
-set -e
-
-################################################################################
-# TEST - Run examples and tests.
-################################################################################
-
-log "Test..."
-
-(
-  # Make sure test_status captures ctest, not tee:
-  # https://stackoverflow.com/a/999259/11130318
-  set -o pipefail
-  echo_and_run_timed "Test" ctest ${CTEST_FLAGS} -j ${PARALLEL_LEVEL} | tee ctest_log
-)
-
-test_status=$?
-
-################################################################################
-# SUMMARY - Print status of each step and exit with failure if needed.
-################################################################################
-
-log "Summary:"
-echo "- Configure Error Code: ${configure_status}"
-echo "- Build Error Code: ${build_status}"
-echo "- Test Error Code: ${test_status}"
-
-if [[ "${configure_status}" != "0" ]] || \
-   [[ "${build_status}" != "0" ]] || \
-   [[ "${test_status}" != "0" ]]; then
-     exit 1
-fi
diff --git a/ci/common/determine_build_parallelism.bash b/ci/common/determine_build_parallelism.bash
deleted file mode 100755
index 1a1cf4c7..00000000
--- a/ci/common/determine_build_parallelism.bash
+++ /dev/null
@@ -1,119 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-function usage {
-  echo "Usage: ${0} [flags...]"
-  echo
-  echo "Examine the system topology to determine a reasonable amount of build"
-  echo "parallelism."
-  echo
-  echo "Exported variables:"
-  echo "  \${LOGICAL_CPUS}          : Logical processors (e.g. threads)."
-  echo "  \${PHYSICAL_CPUS}         : Physical processors (e.g. cores)."
-  echo "  \${TOTAL_MEM}             : Total system memory [GB]."
-  echo "  \${MAX_THREADS_PER_CORE}  : Maximum threads per core allowed."
-  echo "  \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
-  echo "  \${CPU_BOUND_THREADS}     : # of build threads constrained by processors."
-  echo "  \${MEM_BOUND_THREADS}     : # of build threads constrained by memory [GB]."
-  echo "  \${PARALLEL_LEVEL}        : Determined # of build threads."
-  echo "  \${MEM_PER_THREAD}        : Memory [GB] per build thread."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-q, --quiet"
-  echo "  Print nothing and only export variables."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Explicitly set the number of build threads to use."
-  echo
-  echo "--max-threads-per-core <threads>"
-  echo "  Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
-  echo
-  echo "--min-memory-per-thread <gigabytes>"
-  echo "  Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
-
-  exit -3
-}
-
-QUIET=0
-
-export MAX_THREADS_PER_CORE=2
-export MIN_MEMORY_PER_THREAD=1 # [GB]
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -q) ;&
-  --quiet) QUIET=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  --max-threads-per-core)
-    shift # The next argument is the number of threads per core.
-    MAX_THREADS_PER_CORE="${1}"
-    ;;
-  --min-memory-per-thread)
-    shift # The next argument is the amount of memory per thread.
-    MIN_MEMORY_PER_THREAD="${1}"
-    ;;
-  esac
-  shift
-done
-
-# https://stackoverflow.com/a/23378780
-if [ $(uname) == "Darwin" ]; then
-  export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
-  export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
-else
-  export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
-  export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
-fi
-
-export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
-
-export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
-export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
-
-if [[ -z "${PARALLEL_LEVEL}" ]]; then
-  # Pick the smaller of the two as the default.
-  if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
-    export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
-  else
-    export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
-  fi
-else
-  EXPLICIT_PARALLEL_LEVEL=1
-fi
-
-# This can be a floating point number.
-export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
-
-if [[ "${QUIET}" == 0 ]]; then
-  echo    "Logical CPUs:           ${LOGICAL_CPUS} [threads]"
-  echo    "Physical CPUs:          ${PHYSICAL_CPUS} [cores]"
-  echo    "Total Mem:              ${TOTAL_MEM} [GBs]"
-  echo    "Max Threads Per Core:   ${MAX_THREADS_PER_CORE} [threads/core]"
-  echo    "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
-  echo    "CPU Bound Threads:      ${CPU_BOUND_THREADS} [threads]"
-  echo    "Mem Bound Threads:      ${MEM_BOUND_THREADS} [threads]"
-
-  echo -n "Parallel Level:         ${PARALLEL_LEVEL} [threads]"
-  if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
-    echo " (explicitly set)"
-  else
-    echo
-  fi
-
-  echo    "Mem Per Thread:         ${MEM_PER_THREAD} [GBs/thread]"
-fi
-
diff --git a/ci/cpu/build.bash b/ci/cpu/build.bash
deleted file mode 100755
index edf1ba31..00000000
--- a/ci/cpu/build.bash
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (CPU-only)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/gpu/build.bash b/ci/gpu/build.bash
deleted file mode 100755
index 9f6fc01f..00000000
--- a/ci/gpu/build.bash
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (heterogeneous)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/local/build.bash b/ci/local/build.bash
deleted file mode 100755
index 60d22dea..00000000
--- a/ci/local/build.bash
+++ /dev/null
@@ -1,215 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench local containerized build script
-################################################################################
-
-function usage {
-  echo "Usage: ${0} [flags...] [cmake-targets...]"
-  echo
-  echo "Build and test your local repository using a gpuCI Docker image."
-  echo "If CMake targets are specified, only those targets are built and tested."
-  echo "Otherwise, everything is built and tested."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-r <path>, --repository <path>"
-  echo "  Path to the repository (default: ${REPOSITORY_PATH})."
-  echo
-  echo "-i <image>, --image <image>"
-  echo "  Docker image to use (default: ${IMAGE})"
-  echo
-  echo "-l, --local-image"
-  echo "  Use the local version of the image instead of pulling from Docker hub."
-  echo
-  echo "-s, --shell-only"
-  echo "  Skip building and testing and launch an interactive shell instead."
-  echo
-  echo "-d, --disable-gpus"
-  echo "  Don't start the container with the NVIDIA runtime and GPUs attached."
-  echo
-  echo "-c, --clean"
-  echo "  If the build directory already exists, delete it."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Number of threads to use when building (default: inferred)."
-  echo
-  echo "-b <type>, --cmake-build-type <plan>"
-  echo "  CMake build type to use, either Release, RelWithDebInfo, or Debug"
-  echo "  (default: ${CMAKE_BUILD_TYPE})."
-  echo
-
-  exit -3
-}
-
-SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
-
-REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
-
-################################################################################
-# FLAGS - Process command line flags.
-################################################################################
-
-IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9"
-
-LOCAL_IMAGE=0
-
-SHELL_ONLY=0
-
-BUILD_TYPE="gpu"
-
-CLEAN=0
-
-PARALLEL_LEVEL=""
-
-CMAKE_BUILD_TYPE="Release"
-
-TARGETS=""
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -r) ;&
-  --repository)
-    shift # The next argument is the path.
-    REPOSITORY_PATH="${1}"
-    ;;
-  -i) ;&
-  --image)
-    shift # The next argument is the image.
-    IMAGE="${1}"
-    ;;
-  -l) ;&
-  --local-image) LOCAL_IMAGE=1 ;;
-  -s) ;&
-  --shell-only) SHELL_ONLY=1 ;;
-  -d) ;&
-  --disable-gpus) BUILD_TYPE="cpu" ;;
-  -c) ;&
-  --clean) CLEAN=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  -b) ;&
-  --cmake-build-type)
-    shift # The next argument is the build type.
-    CMAKE_BUILD_TYPE="${1}"
-    ;;
-  *)
-    TARGETS="${TARGETS:+${TARGETS} }${1}"
-    ;;
-  esac
-  shift
-done
-
-################################################################################
-# PATHS - Setup paths for the container.
-################################################################################
-
-# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
-# built and tested. It can be set with the --repository flag.
-#
-# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
-# is named after the image name, allowing multiple image builds to coexist on
-# the local filesystem.
-#
-# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
-# the container.
-#
-# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
-# container.
-
-BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
-
-if [[ "${CLEAN}" != 0 ]]; then
-  rm -rf ${BUILD_PATH}
-fi
-
-mkdir -p ${BUILD_PATH}
-
-BASE_PATH_IN_CONTAINER="/cccl"
-
-REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
-
-BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
-
-################################################################################
-# ENVIRONMENT - Setup the thunk build script that will be run by the container.
-################################################################################
-
-# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
-# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
-
-COMMAND="sudo ldconfig; sudo ldconfig"
-if [[ "${SHELL_ONLY}" != 0 ]]; then
-  COMMAND="${COMMAND}; bash"
-else
-  COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
-fi
-
-################################################################################
-# GPU - Setup GPUs.
-################################################################################
-
-# Note: We always start docker with --gpus, even for cpu builds. Otherwise
-# libcuda.so.1 is not present and no NVBench tests are able to run.
-
-# Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
-if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
-  VISIBLE_DEVICES="all"
-else
-  VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
-fi
-
-DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
-GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
-if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
-then
-  GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
-fi
-
-################################################################################
-# LAUNCH - Pull and launch the container.
-################################################################################
-
-#NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
-NVIDIA_DOCKER_INSTALLED=1 # Broken on WSL
-if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
-  echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
-  exit -4
-fi
-
-if [[ "${LOCAL_IMAGE}" == 0 ]]; then
-  docker pull "${IMAGE}"
-fi
-
-docker run --rm -it ${GPU_OPTS} \
-  --cap-add=SYS_PTRACE \
-  --user "$(id -u)":"$(id -g)" \
-  -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
-  -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
-  -v /etc/passwd:/etc/passwd:ro \
-  -v /etc/group:/etc/group:ro \
-  -v /etc/subuid:/etc/subuid:ro \
-  -v /etc/subgid:/etc/subgid:ro \
-  -v /etc/shadow:/etc/shadow:ro \
-  -v /etc/gshadow:/etc/gshadow:ro \
-  -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
-  -e "BUILD_TYPE=${BUILD_TYPE}" \
-  -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
-  -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
-  -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
-  -w "${BUILD_PATH_IN_CONTAINER}" \
-  "${IMAGE}" bash -c "${COMMAND}"
-
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
new file mode 100644
index 00000000..6da2d332
--- /dev/null
+++ b/ci/matrix.yaml
@@ -0,0 +1,86 @@
+
+cuda_prev_min: &cuda_prev_min '11.1' # Does not support the CUPTI APIs we use (added in 11.3)
+cuda_prev_max: &cuda_prev_max '11.8'
+cuda_curr_min: &cuda_curr_min '12.0'
+cuda_curr_max: &cuda_curr_max '12.6'
+
+# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
+devcontainer_version: '24.12'
+
+# gcc compiler configurations
+gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
+gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
+gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
+gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
+gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
+gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
+gcc13: &gcc13 { name: 'gcc', version: '13', exe: 'g++' }
+
+# LLVM Compiler configurations
+llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' }
+llvm10: &llvm10 { name: 'llvm', version: '10', exe: 'clang++' }
+llvm11: &llvm11 { name: 'llvm', version: '11', exe: 'clang++' }
+llvm12: &llvm12 { name: 'llvm', version: '12', exe: 'clang++' }
+llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' }
+llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
+llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
+llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
+llvm17: &llvm17 { name: 'llvm', version: '17', exe: 'clang++' }
+llvm18: &llvm18 { name: 'llvm', version: '18', exe: 'clang++' }
+
+# MSVC configs
+msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' }
+msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
+
+# Each environment below will generate a unique build/test job
+# See the "compute-matrix" job in the workflow for how this is parsed and used
+# cuda: The CUDA Toolkit version
+# os: The operating system used
+# cpu: The CPU architecture
+# compiler: The compiler to use
+#   name: The compiler name
+#   version: The compiler version
+#   exe: The unversioned compiler binary name
+# std: The C++ standards to build for
+#    This field is unique as it will generate an independent build/test job for each value
+
+# Configurations that will run for every PR
+pull_request:
+  nvcc:
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
+    # Fails to compile simple input on CTK12.4. Try to add later.
+    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,  extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019 }
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022 }
diff --git a/ci/ninja_summary.py b/ci/ninja_summary.py
new file mode 100755
index 00000000..f496db53
--- /dev/null
+++ b/ci/ninja_summary.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+# Copyright (c) 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+r"""Summarize the last ninja build, invoked with ninja's -C syntax.
+
+This script is designed to be automatically run after each ninja build in
+order to summarize the build's performance. Making build performance information
+more visible should make it easier to notice anomalies and opportunities. To use
+this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat.
+
+On Linux you can get autoninja to invoke this script using this syntax:
+
+$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome
+
+You can also call this script directly using ninja's syntax to specify the
+output directory of interest:
+
+> python3 post_build_ninja_summary.py -C out/Default
+
+Typical output looks like this:
+
+>ninja -C out\debug_component base
+ninja.exe -C out\debug_component base -j 960 -l 48  -d keeprsp
+ninja: Entering directory `out\debug_component'
+[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files
+Longest build steps:
+       0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time)
+       0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time)
+       0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time)
+       1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time)
+Time by build-step type:
+       0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum)
+       0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum)
+       0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum)
+       1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed
+time sum)
+      23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum)
+26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism)
+839 build steps completed, average of 32.17/s
+
+If no gn clean has been done then results will be for the last non-NULL
+invocation of ninja. Ideas for future statistics, and implementations are
+appreciated.
+
+The "weighted" time is the elapsed time of each build step divided by the number
+of tasks that were running in parallel. This makes it an excellent approximation
+of how "important" a slow step was. A link that is entirely or mostly serialized
+will have a weighted time that is the same or similar to its elapsed time. A
+compile that runs in parallel with 999 other compiles will have a weighted time
+that is tiny."""
+
+import argparse
+import errno
+import fnmatch
+import os
+import subprocess
+import sys
+
+# The number of long build times to report:
+long_count = 10
+# The number of long times by extension to report
+long_ext_count = 10
+
+
+class Target:
+    """Represents a single line read for a .ninja_log file."""
+    def __init__(self, start, end):
+        """Creates a target object by passing in the start/end times in seconds
+        as a float."""
+        self.start = start
+        self.end = end
+        # A list of targets, appended to by the owner of this object.
+        self.targets = []
+        self.weighted_duration = 0.0
+
+    def Duration(self):
+        """Returns the task duration in seconds as a float."""
+        return self.end - self.start
+
+    def SetWeightedDuration(self, weighted_duration):
+        """Sets the duration, in seconds, passed in as a float."""
+        self.weighted_duration = weighted_duration
+
+    def WeightedDuration(self):
+        """Returns the task's weighted duration in seconds as a float.
+
+        Weighted_duration takes the elapsed time of the task and divides it
+        by how many other tasks were running at the same time. Thus, it
+        represents the approximate impact of this task on the total build time,
+        with serialized or serializing steps typically ending up with much
+        longer weighted durations.
+        weighted_duration should always be the same or shorter than duration.
+        """
+        # Allow for modest floating-point errors
+        epsilon = 0.000002
+        if (self.weighted_duration > self.Duration() + epsilon):
+            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+        assert (self.weighted_duration <= self.Duration() + epsilon)
+        return self.weighted_duration
+
+    def DescribeTargets(self):
+        """Returns a printable string that summarizes the targets."""
+        # Some build steps generate dozens of outputs - handle them sanely.
+        # The max_length was chosen so that it can fit most of the long
+        # single-target names, while minimizing word wrapping.
+        result = ', '.join(self.targets)
+        max_length = 65
+        if len(result) > max_length:
+            result = result[:max_length] + '...'
+        return result
+
+
+# Copied with some modifications from ninjatracing
+def ReadTargets(log, show_all):
+    """Reads all targets from the open .ninja_log file object |log|.
+
+    The result is an unsorted list of Target objects, one per command hash."""
+    header = log.readline()
+    # Handle empty ninja_log gracefully by silently returning an empty list of
+    # targets.
+    if not header:
+        return []
+    assert header == '# ninja log v5\n', \
+           'unrecognized ninja log version %r' % header
+    targets_dict = {}
+    last_end_seen = 0.0
+    for line in log:
+        parts = line.strip().split('\t')
+        if len(parts) != 5:
+            # If ninja.exe is rudely halted then the .ninja_log file may be
+            # corrupt. Silently continue.
+            continue
+        start, end, _, name, cmdhash = parts  # Ignore restat.
+        # Convert from integral milliseconds to float seconds.
+        start = int(start) / 1000.0
+        end = int(end) / 1000.0
+        if not show_all and end < last_end_seen:
+            # An earlier time stamp means that this step is the first in a new
+            # build, possibly an incremental build. Throw away the previous
+            # data so that this new build will be displayed independently.
+            # This has to be done by comparing end times because records are
+            # written to the .ninja_log file when commands complete, so end
+            # times are guaranteed to be in order, but start times are not.
+            targets_dict = {}
+        target = None
+        if cmdhash in targets_dict:
+            target = targets_dict[cmdhash]
+            if not show_all and (target.start != start or target.end != end):
+                # If several builds in a row just run one or two build steps
+                # then the end times may not go backwards so the last build may
+                # not be detected as such. However in many cases there will be a
+                # build step repeated in the two builds and the changed
+                # start/stop points for that command, identified by the hash,
+                # can be used to detect and reset the target dictionary.
+                targets_dict = {}
+                target = None
+        if not target:
+            targets_dict[cmdhash] = target = Target(start, end)
+        last_end_seen = end
+        target.targets.append(name)
+    return list(targets_dict.values())
+
+
+def GetExtension(target, extra_patterns):
+    """Return the file extension that best represents a target.
+
+  For targets that generate multiple outputs it is important to return a
+  consistent 'canonical' extension. Ultimately the goal is to group build steps
+  by type."""
+    for output in target.targets:
+        if extra_patterns:
+            for fn_pattern in extra_patterns.split(';'):
+                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+                    return fn_pattern
+        # Not a true extension, but a good grouping.
+        if output.endswith('type_mappings'):
+            extension = 'type_mappings'
+            break
+
+        # Capture two extensions if present. For example: file.javac.jar should
+        # be distinguished from file.interface.jar.
+        root, ext1 = os.path.splitext(output)
+        _, ext2 = os.path.splitext(root)
+        extension = ext2 + ext1  # Preserve the order in the file name.
+
+        if len(extension) == 0:
+            extension = '(no extension found)'
+
+        if ext1 in ['.pdb', '.dll', '.exe']:
+            extension = 'PEFile (linking)'
+            # Make sure that .dll and .exe are grouped together and that the
+            # .dll.lib files don't cause these to be listed as libraries
+            break
+        if ext1 in ['.so', '.TOC']:
+            extension = '.so (linking)'
+            # Attempt to identify linking, avoid identifying as '.TOC'
+            break
+        # Make sure .obj files don't get categorized as mojo files
+        if ext1 in ['.obj', '.o']:
+            break
+        # Jars are the canonical output of java targets.
+        if ext1 == '.jar':
+            break
+        # Normalize all mojo related outputs to 'mojo'.
+        if output.count('.mojom') > 0:
+            extension = 'mojo'
+            break
+    return extension
+
+
+def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
+    """Print a summary of the passed in list of Target objects."""
+
+    # Create a list that is in order by time stamp and has entries for the
+    # beginning and ending of each build step (one time stamp may have multiple
+    # entries due to multiple steps starting/stopping at exactly the same time).
+    # Iterate through this list, keeping track of which tasks are running at all
+    # times. At each time step calculate a running total for weighted time so
+    # that when each task ends its own weighted time can easily be calculated.
+    task_start_stop_times = []
+
+    earliest = -1
+    latest = 0
+    total_cpu_time = 0
+    for target in entries:
+        if earliest < 0 or target.start < earliest:
+            earliest = target.start
+        if target.end > latest:
+            latest = target.end
+        total_cpu_time += target.Duration()
+        task_start_stop_times.append((target.start, 'start', target))
+        task_start_stop_times.append((target.end, 'stop', target))
+    length = latest - earliest
+    weighted_total = 0.0
+
+    # Sort by the time/type records and ignore |target|
+    task_start_stop_times.sort(key=lambda times: times[:2])
+    # Now we have all task start/stop times sorted by when they happen. If a
+    # task starts and stops on the same time stamp then the start will come
+    # first because of the alphabet, which is important for making this work
+    # correctly.
+    # Track the tasks which are currently running.
+    running_tasks = {}
+    # Record the time we have processed up to so we know how to calculate time
+    # deltas.
+    last_time = task_start_stop_times[0][0]
+    # Track the accumulated weighted time so that it can efficiently be added
+    # to individual tasks.
+    last_weighted_time = 0.0
+    # Scan all start/stop events.
+    for event in task_start_stop_times:
+        time, action_name, target = event
+        # Accumulate weighted time up to now.
+        num_running = len(running_tasks)
+        if num_running > 0:
+            # Update the total weighted time up to this moment.
+            last_weighted_time += (time - last_time) / float(num_running)
+        if action_name == 'start':
+            # Record the total weighted task time when this task starts.
+            running_tasks[target] = last_weighted_time
+        if action_name == 'stop':
+            # Record the change in the total weighted task time while this task
+            # ran.
+            weighted_duration = last_weighted_time - running_tasks[target]
+            target.SetWeightedDuration(weighted_duration)
+            weighted_total += weighted_duration
+            del running_tasks[target]
+        last_time = time
+    assert (len(running_tasks) == 0)
+
+    # Warn if weighted times are off by over half a second (units: seconds).
+    if abs(length - weighted_total) > 0.5:
+        print('Warning: Possible corrupt ninja log, results may be '
+              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+              (length, weighted_total))
+
+    # Print the slowest build steps:
+    print('    Longest build steps:')
+    if elapsed_time_sorting:
+        entries.sort(key=lambda x: x.Duration())
+    else:
+        entries.sort(key=lambda x: x.WeightedDuration())
+    for target in entries[-long_count:]:
+        print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+              (target.WeightedDuration(), target.DescribeTargets(),
+               target.Duration()))
+
+    # Sum up the time by file extension/type of the output file
+    count_by_ext = {}
+    time_by_ext = {}
+    weighted_time_by_ext = {}
+    # Scan through all of the targets to build up per-extension statistics.
+    for target in entries:
+        extension = GetExtension(target, extra_step_types)
+        time_by_ext[extension] = time_by_ext.get(extension,
+                                                 0) + target.Duration()
+        weighted_time_by_ext[extension] = weighted_time_by_ext.get(
+            extension, 0) + target.WeightedDuration()
+        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
+
+    print('    Time by build-step type:')
+    # Copy to a list with extension name and total time swapped, to (time, ext)
+    if elapsed_time_sorting:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in time_by_ext.items())
+    else:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in weighted_time_by_ext.items())
+    # Print the slowest build target types:
+    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
+        print(
+            '      %8.1f s weighted time to generate %d %s files '
+            '(%1.1f s elapsed time sum)' %
+            (time, count_by_ext[extension], extension, time_by_ext[extension]))
+
+    # A zero-length build (all steps share one timestamp) must not divide by 0.
+    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx parallelism)'
+          % (length, total_cpu_time, total_cpu_time / length if length else 0.0))
+    print('    %d build steps completed, average of %1.2f/s' %
+          (len(entries), len(entries) / length if length else 0.0))
+
+
+def main():
+    log_file = '.ninja_log'
+    metrics_file = 'siso_metrics.json'
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument(
+        '-s',
+        '--step-types',
+        help='semicolon separated fnmatch patterns for build-step grouping')
+    parser.add_argument(
+        '-e',
+        '--elapsed_time_sorting',
+        default=False,
+        action='store_true',
+        help='Sort output by elapsed time instead of weighted time')
+    parser.add_argument('--log-file',
+                        help="specific ninja log file to analyze.")
+    args, _extra_args = parser.parse_known_args()
+    if args.build_directory:
+        log_file = os.path.join(args.build_directory, log_file)
+        metrics_file = os.path.join(args.build_directory, metrics_file)
+    if args.log_file:
+        log_file = args.log_file
+    if not args.step_types:
+        # Offer a convenient way to add extra step types automatically,
+        # including when this script is run by autoninja. get() returns None if
+        # the variable isn't set.
+        args.step_types = os.environ.get('chromium_step_types')
+    if args.step_types:
+        # Make room for the extra build types.
+        global long_ext_count
+        long_ext_count += len(args.step_types.split(';'))
+
+    if os.path.exists(metrics_file):
+        # Automatically handle summarizing siso builds.
+        cmd = ['siso.bat' if 'win32' in sys.platform else 'siso']
+        cmd.extend(['metrics', 'summary'])
+        if args.build_directory:
+            cmd.extend(['-C', args.build_directory])
+        if args.step_types:
+            cmd.extend(['--step_types', args.step_types])
+        if args.elapsed_time_sorting:
+            cmd.append('--elapsed_time_sorting')
+        subprocess.run(cmd)
+    else:
+        try:
+            with open(log_file, 'r') as log:
+                entries = ReadTargets(log, False)
+                if entries:
+                    SummarizeEntries(entries, args.step_types,
+                                     args.elapsed_time_sorting)
+        except IOError:
+            print('Log file %r not found, no build summary created.' % log_file)
+            return errno.ENOENT
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/ci/pretty_printing.sh b/ci/pretty_printing.sh
new file mode 100644
index 00000000..5bea1af9
--- /dev/null
+++ b/ci/pretty_printing.sh
@@ -0,0 +1,105 @@
+# Print "NAME=value" for every variable name given; unset or empty ones show "(undefined)".
+function print_var_values() {
+    local var_name  # keep the loop variable out of the caller's scope
+    for var_name in "$@"; do
+        if [ -z "$var_name" ]; then
+            echo "Usage: print_var_values <variable_name1> <variable_name2> ..."
+            return 1
+        fi
+
+        # Indirect expansion: look up the variable whose name is stored in var_name.
+        echo "$var_name=${!var_name:-(undefined)}"
+    done
+}
+
+# begin_group: Open a named, colorized section in the log output.
+# Usage: begin_group "Group Name" [Color]
+#   Group Name: Title shown for the section.
+#   Color (optional): ANSI color code for the title. Default is blue (34).
+function begin_group() {
+    # ANSI color code reference: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
+    local line default_color="34"
+    local title="${1:-}"
+    local ansi="${2:-$default_color}"
+    if [ -z "${GITHUB_ACTIONS:-}" ]; then
+        line="\e[${ansi}m================== ${title} ======================\e[0m"
+    else
+        line="::group::\e[${ansi}m${title}\e[0m"
+    fi
+    echo -e "$line"
+}
+
+# end_group: Close a named section of log output and report success or failure.
+# Usage: end_group "Group Name" [Exit Status] [Duration]
+#   Group Name: Name of the group. Exit Status (optional): status of the grouped command, default 0.
+#   Duration (optional): Elapsed seconds; only printed when not running under GitHub Actions.
+function end_group() {
+    local title="${1:-}"
+    local rc="${2:-0}"
+    local secs="${3:-}"
+    local red="31" blue="34"
+    # Expands to " - Duration: <n>s" when a duration was given, otherwise to nothing.
+    local timing="${secs:+ - Duration: ${secs}s}"
+
+    if [ -z "${GITHUB_ACTIONS:-}" ]; then
+        if [ "$rc" -ne 0 ]; then
+            echo -e "\e[${red}m================== End ${title} - Failed${timing} ==================\e[0m"
+        else
+            echo -e "\e[${blue}m================== End ${title} - Success${timing} ==================\n\e[0m"
+        fi
+    else
+        echo "::endgroup::"
+        if [ "$rc" -ne 0 ]; then
+            echo -e "::error::\e[${red}m ${title} - Failed (⬆️ click above for full log ⬆️)\e[0m"
+        fi
+    fi
+}
+
+declare -A command_durations
+
+# run_command: Run a command inside a named log group, time it, and propagate its exit status.
+# Usage: run_command "Group Name" command [arguments...]
+function run_command() {
+    local label="${1:-}"
+    shift
+    local -a argv=("$@")
+    local rc t0 t1 elapsed
+
+    begin_group "$label"
+    set +e  # let the command fail without aborting, so its status can be reported
+    t0=$(date +%s)
+    "${argv[@]}"
+    rc=$?
+    t1=$(date +%s)
+    set -e  # note: errexit is switched on here regardless of its state on entry
+    elapsed=$((t1 - t0))
+    end_group "$label" "$rc" "$elapsed"
+    command_durations["$label"]=$elapsed
+    return "$rc"
+}
+
+function string_width() {  # print the length of $1 as computed by awk
+    local text="$1"
+    echo "$text" | awk '{ print length($0) }'
+}
+
+function print_time_summary() {
+    local width=0
+    local label
+
+    # Column width = length of the longest recorded group name.
+    for label in "${!command_durations[@]}"; do
+        local label_width=$(string_width "$label")
+        if [ "$label_width" -gt "$width" ]; then
+            width=$label_width
+        fi
+    done
+
+    echo "Time Summary:"
+    for label in "${!command_durations[@]}"; do
+        printf "%-${width}s : %s seconds\n" "$label" "${command_durations[$label]}"
+    done
+
+    # Reset the timing table so the next summary starts empty.
+    declare -gA command_durations=()
+}
diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh
new file mode 100755
index 00000000..de8ae465
--- /dev/null
+++ b/ci/sccache_hit_rate.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Ensure two arguments are provided
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <before-file> <after-file>" >&2
+  exit 1
+fi
+
+# Print the contents of the before file
+echo "=== Contents of $1 ===" >&2
+cat $1 >&2
+echo "=== End of $1 ===" >&2
+
+# Print the contents of the after file
+echo "=== Contents of $2 ==="  >&2
+cat $2 >&2
+echo "=== End of $2 ===" >&2
+
+# Extract compile requests and cache hits from the before and after files
+requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1")
+hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1")
+requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2")
+hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2")
+
+# Calculate the differences to find out how many new requests and hits
+requests_diff=$((requests_after - requests_before))
+hits_diff=$((hits_after - hits_before))
+
+echo "New Compile Requests: $requests_diff" >&2
+echo "New Hits: $hits_diff" >&2
+
+# Calculate and print the hit rate
+if [ $requests_diff -eq 0 ]; then
+    echo "No new compile requests, hit rate is not applicable"
+else
+    hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
+    echo "sccache hit rate: $hit_rate%" >&2
+    echo "$hit_rate" 
+fi
diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh
new file mode 100755
index 00000000..3a3ebc42
--- /dev/null
+++ b/ci/sccache_stats.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# This script prints the sccache hit rate between two calls to sccache --show-stats.
+# It should be sourced in your script before and after the operations you want to profile,
+# with the 'start' or 'end' argument respectively.
+
+mode=$1
+
+if [[ "$mode" != "start" && "$mode" != "end" ]]; then
+    echo "Invalid mode: $mode"
+    echo "Usage: $0 {start|end}"
+    exit 1
+fi
+
+# Check if sccache is available
+if ! command -v sccache &> /dev/null; then
+    echo "Notice: sccache is not available. Skipping..."
+    exit 0
+fi
+
+case $mode in
+  start)
+    export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    ;;
+  end)
+    if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then
+        echo "Error: start stats not collected. Did you call this script with 'start' before your operations?"
+        exit 1
+    fi
+
+    final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    hits=$((final_hits - SCCACHE_START_HITS))
+    misses=$((final_misses - SCCACHE_START_MISSES))
+    total=$((hits + misses))
+
+    prefix=""
+    if [ ${GITHUB_ACTIONS:-false} = "true" ]; then
+      prefix="::notice::"
+    fi
+
+    if (( total > 0 )); then
+      hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }')
+      echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%"
+    else
+      echo ${prefix}"sccache stats: N/A No new compilation requests"
+    fi
+    unset SCCACHE_START_HITS
+    unset SCCACHE_START_MISSES
+    ;;
+esac
diff --git a/ci/test_nvbench.sh b/ci/test_nvbench.sh
new file mode 100755
index 00000000..40559eda
--- /dev/null
+++ b/ci/test_nvbench.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+source "$(dirname "$0")/build_common.sh"
+
+# Run NVBench tests with high parallelism. If any need to be
+# serialized, define the `RUN_SERIAL` CMake property on the
+# test.
+export CTEST_PARALLEL_LEVEL=${PARALLEL_LEVEL}
+
+print_environment_details
+
+./build_nvbench.sh "$@"
+
+PRESET="nvbench-ci"
+
+test_preset "NVBench" ${PRESET}
+
+print_time_summary
diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1
new file mode 100644
index 00000000..1edea634
--- /dev/null
+++ b/ci/windows/build_common.psm1
@@ -0,0 +1,207 @@
+
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17
+)
+
+# We need the full path to cl because otherwise cmake will replace CMAKE_CXX_COMPILER with the full path
+# and keep CMAKE_CUDA_HOST_COMPILER at "cl" which breaks our cmake script
+$script:HOST_COMPILER  = (Get-Command "cl").source -replace '\\','/'
+$script:PARALLEL_LEVEL = (Get-WmiObject -class Win32_processor).NumberOfLogicalProcessors
+
+# Extract the CL version for export to build scripts:
+$script:CL_VERSION_STRING = & cl.exe /?
+if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") {
+    $CL_VERSION = [version]$matches[1]
+    Write-Host "Detected cl.exe version: $CL_VERSION"
+}
+
+if (-not $env:CCCL_BUILD_INFIX) {
+    $env:CCCL_BUILD_INFIX = ""
+}
+
+# Presets will be configured in this directory:
+$BUILD_DIR = "../build/$env:CCCL_BUILD_INFIX"
+
+If(!(test-path -PathType container "../build")) {
+    New-Item -ItemType Directory -Path "../build"
+}
+
+# Create the build directory for this configuration (-Force: no error if it already exists)
+New-Item -ItemType Directory -Path "$BUILD_DIR" -Force
+
+# Prepare environment for CMake. HOST_COMPILER is already a path string (it has no .FullName):
+$env:CMAKE_BUILD_PARALLEL_LEVEL = $PARALLEL_LEVEL
+$env:CTEST_PARALLEL_LEVEL = 1
+$env:CUDAHOSTCXX = $HOST_COMPILER
+$env:CXX = $HOST_COMPILER
+
+Write-Host "========================================"
+Write-Host "Begin build"
+Write-Host "pwd=$pwd"
+Write-Host "BUILD_DIR=$BUILD_DIR"
+Write-Host "CXX_STANDARD=$CXX_STANDARD"
+Write-Host "CXX=$env:CXX"
+Write-Host "CUDACXX=$env:CUDACXX"
+Write-Host "CUDAHOSTCXX=$env:CUDAHOSTCXX"
+Write-Host "NVCC_VERSION=$NVCC_VERSION"  # NOTE(review): not assigned in this module; confirm where it is set
+Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL"
+Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL"
+Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX"
+Write-Host "Current commit is:"
+Write-Host "$(git log -1)"
+Write-Host "========================================"
+
+function configure_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+    # Runs "cmake --preset <PRESET> <CMAKE_OPTIONS>" from the parent directory; throws if it fails.
+    $step = "$BUILD_NAME (configure)"
+
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    $cmake_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE"
+    echo "$cmake_command"
+    Invoke-Expression $cmake_command
+    $test_result = $LastExitCode
+    # Restore the caller's location before reporting, so a failure does not leave it changed.
+    popd
+
+    If ($test_result -ne 0) {
+        throw "$step Failed"
+    }
+    Write-Host "$step complete."
+}
+
+function build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+    # Runs "cmake --build --preset <PRESET>" between two sccache_stats snapshots; throws on failure.
+    $step = "$BUILD_NAME (build)"
+
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    sccache_stats('Start')
+
+    cmake --build --preset $PRESET -v
+    $test_result = $LastExitCode
+
+    sccache_stats('Stop')
+
+    # Restore the caller's location first so a failure does not leave it changed.
+    popd
+
+    If ($test_result -ne 0) {
+         throw "$step Failed"
+    }
+    echo "$step complete"
+}
+
+function test_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+    # Runs "ctest --preset <PRESET>" between two sccache_stats snapshots; throws on failure.
+    $step = "$BUILD_NAME (test)"
+
+    # CTest must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    sccache_stats('Start')
+
+    ctest --preset $PRESET
+    $test_result = $LastExitCode
+
+    sccache_stats('Stop')
+
+    # Restore the caller's location first so a failure does not leave it changed.
+    popd
+
+    If ($test_result -ne 0) {
+         throw "$step Failed"
+    }
+    echo "$step complete"
+}
+
+function configure_and_build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+    # Configure the preset, then build it; either step throws on failure.
+    configure_preset -BUILD_NAME $BUILD_NAME -PRESET $PRESET -CMAKE_OPTIONS $CMAKE_OPTIONS
+    build_preset -BUILD_NAME $BUILD_NAME -PRESET $PRESET
+}
+
+function sccache_stats {
+    Param (
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [ValidateSet('Start','Stop')]
+        [string]$MODE
+    )
+    # 'Start' snapshots counters read by line position from `sccache -s`; 'Stop' prints the change.
+    $stats_lines = sccache -s
+    If ($MODE -ne 'Start') {
+        [int]$end_requests  = ($stats_lines[0] -replace '[^\d]+')
+        [int]$end_hits_cpp  = ($stats_lines[2] -replace '[^\d]+')
+        [int]$end_hits_cuda = ($stats_lines[3] -replace '[^\d]+')
+        [int]$end_miss_cpp  = ($stats_lines[5] -replace '[^\d]+')
+        [int]$end_miss_cuda = ($stats_lines[6] -replace '[^\d]+')
+        # Deltas relative to the script-scope counters saved by the preceding 'Start' call.
+        [int]$new_requests  = $end_requests  - $script:sccache_compile_requests
+        [int]$new_hits_cpp  = $end_hits_cpp  - $script:sccache_cache_hits_cpp
+        [int]$new_hits_cuda = $end_hits_cuda - $script:sccache_cache_hits_cuda
+        [int]$new_miss_cpp  = $end_miss_cpp  - $script:sccache_cache_miss_cpp
+        [int]$new_miss_cuda = $end_miss_cuda - $script:sccache_cache_miss_cuda
+        If ( $new_requests -le 0 ) {
+            echo "sccache stats: N/A No new compilation requests"
+        } else {
+            [int]$rate_cpp  = $new_hits_cpp  / $new_requests * 100;
+            [int]$rate_cuda = $new_hits_cuda / $new_requests * 100;
+            echo "sccache hits cpp:  $new_hits_cpp  `t| misses: $new_miss_cpp  `t| hit rate: $rate_cpp%"
+            echo "sccache hits cuda: $new_hits_cuda `t| misses: $new_miss_cuda `t| hit rate: $rate_cuda%"
+        }
+    } else {
+        [int]$script:sccache_compile_requests = ($stats_lines[0] -replace '[^\d]+')
+        [int]$script:sccache_cache_hits_cpp   = ($stats_lines[2] -replace '[^\d]+')
+        [int]$script:sccache_cache_hits_cuda  = ($stats_lines[3] -replace '[^\d]+')
+        [int]$script:sccache_cache_miss_cpp   = ($stats_lines[5] -replace '[^\d]+')
+        [int]$script:sccache_cache_miss_cuda  = ($stats_lines[6] -replace '[^\d]+')
+    }
+}
+
+Export-ModuleMember -Function configure_preset, build_preset, test_preset, configure_and_build_preset, sccache_stats
+Export-ModuleMember -Variable BUILD_DIR, CL_VERSION
diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1
new file mode 100644
index 00000000..7240698c
--- /dev/null
+++ b/ci/windows/build_nvbench.ps1
@@ -0,0 +1,30 @@
+
+Param(
+    [Parameter(Mandatory = $false)]
+    [Alias("cmake-options")]
+    [ValidateNotNullOrEmpty()]
+    [string]$ARG_CMAKE_OPTIONS = ""
+)
+
+$CURRENT_PATH = Split-Path $pwd -leaf
+If($CURRENT_PATH -ne "ci") {
+    Write-Host "Moving to ci folder"
+    pushd "$PSScriptRoot/.."
+}
+# Drop any previously loaded copy so the module is re-imported; ignore the "not loaded" error.
+Remove-Module -Name build_common -ErrorAction SilentlyContinue
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList 17
+
+$PRESET = "nvbench-ci"
+$CMAKE_OPTIONS = ""
+
+# Append any arguments passed in on the command line
+If($ARG_CMAKE_OPTIONS -ne "") {
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
+}
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+
+If($CURRENT_PATH -ne "ci") {
+    popd
+}
diff --git a/ci/windows/test_nvbench.ps1 b/ci/windows/test_nvbench.ps1
new file mode 100644
index 00000000..4ee5106b
--- /dev/null
+++ b/ci/windows/test_nvbench.ps1
@@ -0,0 +1,31 @@
+
+Param(
+    [Parameter(Mandatory = $false)]
+    [Alias("cmake-options")]
+    [ValidateNotNullOrEmpty()]
+    [string]$ARG_CMAKE_OPTIONS = ""
+)
+
+$CURRENT_PATH = Split-Path $pwd -leaf
+If($CURRENT_PATH -ne "ci") {
+    Write-Host "Moving to ci folder"
+    pushd "$PSScriptRoot/.."
+}
+# Drop any previously loaded copy so the module is re-imported; ignore the "not loaded" error.
+Remove-Module -Name build_common -ErrorAction SilentlyContinue
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList 17
+
+$PRESET = "nvbench-ci"
+$CMAKE_OPTIONS = ""
+
+# Append any arguments passed in on the command line
+If($ARG_CMAKE_OPTIONS -ne "") {
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
+}
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+test_preset "NVBench" "$PRESET"
+
+If($CURRENT_PATH -ne "ci") {
+    popd
+}
diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake
new file mode 100644
index 00000000..6a86d6ac
--- /dev/null
+++ b/cmake/DetectSupportedStandards.cmake
@@ -0,0 +1,65 @@
+# Detect the language standards supported by the current compilers.
+#
+# Usage: detect_supported_standards(<var_prefix> <lang> <standards>)
+#
+# - var_prefix: Used to name result variables,
+#   e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
+#   each XX in ${standards}.
+# - lang: The language to test: C, CXX, or CUDA.
+# - standards: List of any standard versions.
+#
+# Example: detect_supported_standards(PROJ CXX 11 14 17)
+#   - Sets the following variables in the parent scope to TRUE or FALSE:
+#     - PROJ_CXX_11_SUPPORTED
+#     - PROJ_CXX_14_SUPPORTED
+#     - PROJ_CXX_17_SUPPORTED
+#   - Sets `PROJ_DETECTED_CXX_STANDARDS` to a list of supported standards (e.g. "11;14;17").
+function(detect_supported_standards prefix lang)
+  string(TOLOWER "${lang}_std" feature_prefix)
+  set(all_stds)
+  foreach(standard IN LISTS ARGN)
+    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
+    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
+      set(${var_name} TRUE)
+    else()
+      set(${var_name} FALSE)
+    endif()
+
+    # Special cases:
+    if (standard EQUAL 17 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
+      # gcc < 7 and clang < 8 don't fully support C++17.
+      # They accept the flag and have partial support, but nvcc will refuse
+      # to enable it and falls back to the default dialect for the current
+      # CXX compiler version. This breaks our CI.
+      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
+      # but we can't rely on it, so manually disable the dialect in these cases.
+      set(${var_name} FALSE)
+    endif()
+
+    if (standard EQUAL 20 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.30)))
+      # Similar to the above, but for C++20. MSVC's CMAKE_CXX_COMPILER_VERSION is 19.x (not 19xx).
+      set(${var_name} FALSE)
+    endif()
+
+    if (${var_name})
+      list(APPEND all_stds ${standard})
+    endif()
+
+    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
+    set(${var_name} ${${var_name}} PARENT_SCOPE)
+  endforeach()
+
+  set(${prefix}_DETECTED_${lang}_STANDARDS "${all_stds}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/NVBenchClangdCompileInfo.cmake b/cmake/NVBenchClangdCompileInfo.cmake
new file mode 100644
index 00000000..a4b9c5e7
--- /dev/null
+++ b/cmake/NVBenchClangdCompileInfo.cmake
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Tell cmake to generate a json file of compile commands for clangd:
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Symlink the compile command output to the source dir, where clangd will find it.
+set(compile_commands_file "${CMAKE_BINARY_DIR}/compile_commands.json")
+set(compile_commands_link "${CMAKE_SOURCE_DIR}/compile_commands.json")
+message(STATUS "Creating symlink from ${compile_commands_link} to ${compile_commands_file}...")
+nvbench_execute_non_fatal_process(COMMAND
+  "${CMAKE_COMMAND}" -E rm -f "${compile_commands_link}") # remove any existing link/file first
+nvbench_execute_non_fatal_process(COMMAND
+  "${CMAKE_COMMAND}" -E touch "${compile_commands_file}") # create the link target if it is missing
+nvbench_execute_non_fatal_process(COMMAND
+  "${CMAKE_COMMAND}" -E create_symlink "${compile_commands_file}" "${compile_commands_link}")
diff --git a/cmake/NVBenchConfigTarget.cmake b/cmake/NVBenchConfigTarget.cmake
index ebb6e4d4..bef95fcf 100644
--- a/cmake/NVBenchConfigTarget.cmake
+++ b/cmake/NVBenchConfigTarget.cmake
@@ -29,7 +29,6 @@ function(nvbench_add_cxx_flag target_name type flag)
     target_compile_options(${target_name} ${type}
       $<$<COMPILE_LANGUAGE:CXX>:${flag}>
       $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcompiler=${flag}>
-      # FIXME nvc++ case
     )
   endif()
 endfunction()
@@ -57,14 +56,15 @@ else()
   nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-parameter")
   nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wvla")
   nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wgnu")
+  nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wno-gnu-line-marker") # WAR 3916341
 
   if (NVBench_ENABLE_WERROR)
     nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Werror")
   endif()
 endif()
 
-# GCC-specific flags
-if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+# Experimental filesystem library
+if (CMAKE_CXX_COMPILER_ID STREQUAL GNU OR CMAKE_CXX_COMPILER_ID STREQUAL Clang)
   target_link_libraries(nvbench.build_interface INTERFACE stdc++fs)
 endif()
 
diff --git a/cmake/NVBenchDependencies.cmake b/cmake/NVBenchDependencies.cmake
index 5496b8fc..8ba07fe6 100644
--- a/cmake/NVBenchDependencies.cmake
+++ b/cmake/NVBenchDependencies.cmake
@@ -1,52 +1,52 @@
 ################################################################################
 # fmtlib/fmt
-rapids_cpm_find(fmt 7.1.3
+include("${rapids-cmake-dir}/cpm/fmt.cmake")
+
+if(NOT BUILD_SHARED_LIBS AND NVBench_ENABLE_INSTALL_RULES)
+set(export_set_details BUILD_EXPORT_SET nvbench-targets
+                       INSTALL_EXPORT_SET nvbench-targets)
+endif()
+
+rapids_cpm_fmt(${export_set_details}
   CPM_ARGS
-    GITHUB_REPOSITORY fmtlib/fmt
-    GIT_TAG 7.1.3
-    GIT_SHALLOW TRUE
     OPTIONS
       # Force static to keep fmt internal.
       "BUILD_SHARED_LIBS OFF"
-      "CMAKE_POSITION_INDEPENDENT_CODE ON"
 )
 
+if(NOT fmt_ADDED)
+  set(fmt_is_external TRUE)
+endif()
+
 ################################################################################
 # nlohmann/json
 #
 # Following recipe from
 # http://github.com/cpm-cmake/CPM.cmake/blob/master/examples/json/CMakeLists.txt
 # Download the zips because the repo takes an excessively long time to clone.
-rapids_cpm_find(nlohmann_json 3.9.1
-  # Release:
+rapids_cpm_find(nlohmann_json 3.11.3
   CPM_ARGS
-    URL https://github.com/nlohmann/json/releases/download/v3.9.1/include.zip
-    URL_HASH SHA256=6bea5877b1541d353bd77bdfbdb2696333ae5ed8f9e8cc22df657192218cad91
-    PATCH_COMMAND
-      # Work around compiler bug in nvcc 11.0, see NVIDIA/NVBench#18
-      ${CMAKE_COMMAND} -E copy
-        "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/nlohmann_json.hpp"
-        "./include/nlohmann/json.hpp"
-
-  # Development version:
-  # I'm waiting for https://github.com/nlohmann/json/issues/2676 to be fixed,
-  # leave this in to simplify testing patches as they come out. Update the
-  # `nvbench_json` target too when switching branches.
-  #  CPM_ARGS
-  #    VERSION develop
-  #    URL https://github.com/nlohmann/json/archive/refs/heads/develop.zip
-  #    OPTIONS JSON_MultipleHeaders ON
+    URL https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
+    URL_HASH SHA256=a22461d13119ac5c78f205d3df1db13403e58ce1bb1794edc9313677313f4a9d
+  PATCH_COMMAND
+    ${CMAKE_COMMAND}
+      -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
+      -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
+      -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
+      -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/json_unordered_map_ice.cmake"
 )
 
-# nlohmann_json release headers
 add_library(nvbench_json INTERFACE IMPORTED)
-target_include_directories(nvbench_json SYSTEM INTERFACE
-  "${nlohmann_json_SOURCE_DIR}/include"
-)
-
-# nlohmann_json development branch:
-#add_library(nvbench_json INTERFACE)
-#target_link_libraries(nvbench_json INTERFACE nlohmann_json)
+if (TARGET nlohmann_json::nlohmann_json)
+  # If we have a target, just use it. Cannot be an ALIAS library because
+  # nlohmann_json::nlohmann_json itself might be one.
+  target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
+else()
+  # Otherwise we only downloaded the headers.
+  target_include_directories(nvbench_json SYSTEM INTERFACE
+    "${nlohmann_json_SOURCE_DIR}/include"
+  )
+endif()
 
 ################################################################################
 # CUDAToolkit
diff --git a/cmake/NVBenchDependentDlls.cmake b/cmake/NVBenchDependentDlls.cmake
index bd9270d6..1a51c873 100644
--- a/cmake/NVBenchDependentDlls.cmake
+++ b/cmake/NVBenchDependentDlls.cmake
@@ -12,14 +12,6 @@ else()
   set(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD OFF)
 endif()
 
-if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD)
-  message(STATUS
-    "CMake 3.21.0 is required when NVBench_ADD_DEPENDENT_DLLS_TO_BUILD "
-    "is enabled."
-  )
-  cmake_minimum_required(VERSION 3.21.0)
-endif()
-
 function(nvbench_setup_dep_dlls target_name)
   # The custom command below fails when there aren't any runtime DLLs to copy,
   # so only enable it when a relevant dependency is enabled:
diff --git a/cmake/NVBenchExports.cmake b/cmake/NVBenchExports.cmake
index ef96acd9..cb32bf88 100644
--- a/cmake/NVBenchExports.cmake
+++ b/cmake/NVBenchExports.cmake
@@ -1,37 +1,51 @@
 macro(nvbench_generate_exports)
-  set(nvbench_build_export_code_block "")
-  set(nvbench_install_export_code_block "")
+  if(NVBench_ENABLE_INSTALL_RULES)
+    set(nvbench_build_export_code_block "")
+    set(nvbench_install_export_code_block "")
 
-  if (NVBench_ENABLE_NVML)
-    string(APPEND nvbench_build_export_code_block
-      "include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
-    )
-    string(APPEND nvbench_install_export_code_block
-      "include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
-    )
-  endif()
+    if (NVBench_ENABLE_NVML)
+      string(APPEND nvbench_build_export_code_block
+        "include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
+      )
+      string(APPEND nvbench_install_export_code_block
+        "include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
+      )
+    endif()
 
-  if (NVBench_ENABLE_CUPTI)
-    string(APPEND nvbench_build_export_code_block
-      "include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
+    if (NVBench_ENABLE_CUPTI)
+      string(APPEND nvbench_build_export_code_block
+        "include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
+      )
+      string(APPEND nvbench_install_export_code_block
+        "include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
+      )
+    endif()
+
+    if (TARGET nvbench_json)
+      set(nvbench_json_code_block
+        [=[
+        add_library(nvbench_json INTERFACE IMPORTED)
+        if (TARGET nlohmann_json::nlohmann_json)
+          target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
+        endif()
+        ]=])
+      string(APPEND nvbench_build_export_code_block ${nvbench_json_code_block})
+      string(APPEND nvbench_install_export_code_block ${nvbench_json_code_block})
+    endif()
+
+    rapids_export(BUILD NVBench
+      EXPORT_SET nvbench-targets
+      NAMESPACE "nvbench::"
+      GLOBAL_TARGETS nvbench main ctl internal_build_interface
+      LANGUAGES CUDA CXX
+      FINAL_CODE_BLOCK nvbench_build_export_code_block
     )
-    string(APPEND nvbench_install_export_code_block
-      "include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
+    rapids_export(INSTALL NVBench
+      EXPORT_SET nvbench-targets
+      NAMESPACE "nvbench::"
+      GLOBAL_TARGETS nvbench main ctl internal_build_interface
+      LANGUAGES CUDA CXX
+      FINAL_CODE_BLOCK nvbench_install_export_code_block
     )
   endif()
-
-  rapids_export(BUILD NVBench
-    EXPORT_SET nvbench-targets
-    NAMESPACE "nvbench::"
-    GLOBAL_TARGETS nvbench main ctl internal_build_interface
-    LANGUAGES CUDA CXX
-    FINAL_CODE_BLOCK nvbench_build_export_code_block
-  )
-  rapids_export(INSTALL NVBench
-    EXPORT_SET nvbench-targets
-    NAMESPACE "nvbench::"
-    GLOBAL_TARGETS nvbench main ctl internal_build_interface
-    LANGUAGES CUDA CXX
-    FINAL_CODE_BLOCK nvbench_install_export_code_block
-  )
 endmacro()
diff --git a/cmake/NVBenchHeaderTesting.cmake b/cmake/NVBenchHeaderTesting.cmake
new file mode 100644
index 00000000..354ec84d
--- /dev/null
+++ b/cmake/NVBenchHeaderTesting.cmake
@@ -0,0 +1,40 @@
+# For every public header, build a translation unit containing `#include <header>`
+# with some various checks.
+
+set(excluded_headers_regexes
+  # Should never be used externally.
+  "^detail"
+  "^internal"
+)
+
+# Meta target for all configs' header builds:
+add_custom_target(nvbench.headers.all)
+add_dependencies(nvbench.all nvbench.headers.all)
+
+file(GLOB_RECURSE header_files
+  RELATIVE "${NVBench_SOURCE_DIR}/nvbench/"
+  CONFIGURE_DEPENDS
+  "${NVBench_SOURCE_DIR}/nvbench/*.cuh"
+)
+
+foreach (exclusion IN LISTS excluded_headers_regexes)
+  list(FILTER header_files EXCLUDE REGEX "${exclusion}")
+endforeach()
+
+function (nvbench_add_header_target target_name cuda_std) # one generated .cu per public header
+  foreach (header IN LISTS header_files)
+    set(headertest_src "headers/${target_name}/${header}.cu") # generated source path (relative)
+    set(header_str "nvbench/${header}") # Substitution used by configure_file:
+    configure_file("${NVBench_SOURCE_DIR}/cmake/header_test.in.cxx" "${headertest_src}")
+    list(APPEND headertest_srcs "${headertest_src}")
+  endforeach()
+
+  add_library(${target_name} OBJECT ${headertest_srcs}) # OBJECT library: sources are compiled only
+  target_link_libraries(${target_name} PUBLIC nvbench::nvbench)
+  set_target_properties(${target_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
+  add_dependencies(nvbench.headers.all ${target_name}) # hook into the nvbench.headers.all meta target
+endfunction()
+
+foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
+  nvbench_add_header_target(nvbench.headers.cpp${std} ${std})
+endforeach()
diff --git a/cmake/NVBenchInstallRules.cmake b/cmake/NVBenchInstallRules.cmake
index 77bc9ff4..16e9b7e6 100644
--- a/cmake/NVBenchInstallRules.cmake
+++ b/cmake/NVBenchInstallRules.cmake
@@ -1,61 +1,69 @@
-include(GNUInstallDirs)
-rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
-
-# in-source public headers:
-install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
-  TYPE INCLUDE
-  FILES_MATCHING
-    PATTERN "*.cuh"
-    PATTERN "internal" EXCLUDE
-)
-
-# generated headers from build dir:
-install(
-  FILES
-    "${NVBench_BINARY_DIR}/nvbench/config.cuh"
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
-)
-install(
-  FILES
-    "${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
-    "${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
-)
-
-#
-# Install CMake files needed by consumers to locate dependencies:
-#
-
-# Borrowing this logic from rapids_cmake's export logic to make sure these end
-# up in the same location as nvbench-config.cmake:
-rapids_cmake_install_lib_dir(config_install_location)
-set(config_install_location "${config_install_location}/cmake/nvbench")
-
-if (NVBench_ENABLE_NVML)
+
+if(NVBench_ENABLE_INSTALL_RULES) # All install() rules in this block are skipped when off.
+
+  include(GNUInstallDirs)
+  rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
+
+  # in-source public headers:
+  install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
+    TYPE INCLUDE
+    FILES_MATCHING
+      PATTERN "*.cuh"
+      PATTERN "internal" EXCLUDE
+  )
+
+  # generated headers from build dir:
   install(
     FILES
-      "${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
-    DESTINATION "${config_install_location}"
+      "${NVBench_BINARY_DIR}/nvbench/config.cuh"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
   )
-endif()
-
-if (NVBench_ENABLE_CUPTI)
   install(
     FILES
-      "${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
-    DESTINATION "${config_install_location}"
+      "${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
+      "${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh" # NOTE(review): confirm still generated.
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
   )
+
+  #
+  # Install CMake files needed by consumers to locate dependencies:
+  #
+
+  # Borrowing this logic from rapids_cmake's export logic to make sure these end
+  # up in the same location as nvbench-config.cmake:
+  rapids_cmake_install_lib_dir(config_install_location)
+  set(config_install_location "${config_install_location}/cmake/nvbench")
+
+  if (NVBench_ENABLE_NVML)
+    install(
+      FILES
+        "${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
+      DESTINATION "${config_install_location}"
+    )
+  endif()
+
+  if (NVBench_ENABLE_CUPTI)
+    install(
+      FILES
+        "${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
+      DESTINATION "${config_install_location}"
+    )
+  endif()
 endif()
 
 # Call with a list of library targets to generate install rules:
 function(nvbench_install_libraries)
-  install(TARGETS ${ARGN}
-    DESTINATION "${NVBench_INSTALL_LIB_DIR}"
-    EXPORT nvbench-targets
-  )
+  if(NVBench_ENABLE_INSTALL_RULES) # No-op when install rules are disabled.
+    install(TARGETS ${ARGN}
+      DESTINATION "${NVBench_INSTALL_LIB_DIR}"
+      EXPORT nvbench-targets
+    )
+  endif()
 endfunction()
 
 # Call with a list of executables to generate install rules:
 function(nvbench_install_executables)
-  install(TARGETS ${ARGN} EXPORT nvbench-targets)
+  if(NVBench_ENABLE_INSTALL_RULES) # No-op when install rules are disabled.
+    install(TARGETS ${ARGN} EXPORT nvbench-targets)
+  endif()
 endfunction()
diff --git a/cmake/NVBenchNVML.cmake b/cmake/NVBenchNVML.cmake
index f2aadbbe..4b005f3c 100644
--- a/cmake/NVBenchNVML.cmake
+++ b/cmake/NVBenchNVML.cmake
@@ -1,37 +1,43 @@
-# Since this file is installed, we need to make sure that the CUDAToolkit has
-# been found by consumers:
-if (NOT TARGET CUDA::toolkit)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
-if (WIN32)
-  # The CUDA:: targets currently don't provide dll locations through the
-  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
-  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
-  # CMake function from copying the dlls to the build / install directories.
-  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
-  # and the other CMake issues it links to.
-  #
-  # We create a nvbench-specific target that configures the nvml interface as
-  # described here:
-  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
-  #
-  # Use find_file instead of find_library, which would search for a .lib file.
-  # This is also nice because find_file searches recursively (find_library
-  # does not) and some versions of CTK nest nvml.dll several directories deep
-  # under C:\Windows\System32.
-  find_file(NVBench_NVML_DLL nvml.dll REQUIRED
-    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
-    PATHS "C:/Windows/System32"
-  )
-  mark_as_advanced(NVBench_NVML_DLL)
-  add_library(nvbench::nvml SHARED IMPORTED)
-  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
-  set_target_properties(nvbench::nvml PROPERTIES
-    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
-    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
-  )
-else()
-  # Linux is much easier...
-  add_library(nvbench::nvml ALIAS CUDA::nvml)
-endif()
+# Since this file is installed, we need to make sure that the CUDAToolkit has
+# been found by consumers:
+if (NOT TARGET CUDA::toolkit)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (WIN32)
+  # The CUDA:: targets currently don't provide dll locations through the
+  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
+  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
+  # CMake function from copying the dlls to the build / install directories.
+  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
+  # and the other CMake issues it links to.
+  #
+  # We create a nvbench-specific target that configures the nvml interface as
+  # described here:
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
+  #
+  # Use find_file instead of find_library, which would search for a .lib file.
+  # This is also nice because find_file searches recursively (find_library
+  # does not) and some versions of CTK nest nvml.dll several directories deep
+  # under C:\Windows\System32.
+  find_file(NVBench_NVML_DLL nvml.dll
+    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
+    PATHS "C:/Windows/System32"
+  ) # Not REQUIRED: a miss falls through to the CUDA::nvml / FATAL_ERROR branches below.
+  mark_as_advanced(NVBench_NVML_DLL)
+endif()
+
+if (NVBench_NVML_DLL) # True when find_file located nvml.dll or the path was supplied via -D.
+  add_library(nvbench::nvml SHARED IMPORTED)
+  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
+  set_target_properties(nvbench::nvml PROPERTIES
+    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
+    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
+  )
+elseif(TARGET CUDA::nvml) # Otherwise reuse the CUDA::nvml target if one exists.
+  add_library(nvbench::nvml ALIAS CUDA::nvml)
+else() # Neither a DLL path nor a CUDA::nvml target: stop with instructions.
+  message(FATAL_ERROR "Could not find nvml.dll or CUDA::nvml target. "
+          "Set -DNVBench_ENABLE_NVML=OFF to disable NVML support "
+          "or set -DNVBench_NVML_DLL to the full path to nvml.dll on Windows.")
+endif()
diff --git a/cmake/NVBenchRapidsCMake.cmake b/cmake/NVBenchRapidsCMake.cmake
index 5c09d302..b110ccc5 100644
--- a/cmake/NVBenchRapidsCMake.cmake
+++ b/cmake/NVBenchRapidsCMake.cmake
@@ -1,10 +1,12 @@
 # Called before project(...)
 macro(nvbench_load_rapids_cmake)
-  file(DOWNLOAD
-    https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake
-    "${CMAKE_BINARY_DIR}/RAPIDS.cmake"
-  )
-  include("${CMAKE_BINARY_DIR}/RAPIDS.cmake")
+  if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
+    file(DOWNLOAD
+      https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake
+      "${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake"
+    )
+  endif()
+  include("${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
 
   include(rapids-cmake)
   include(rapids-cpm)
@@ -19,9 +21,5 @@ endmacro()
 macro(nvbench_init_rapids_cmake)
   rapids_cmake_build_type(Release)
   rapids_cmake_write_version_file("${NVBench_BINARY_DIR}/nvbench/detail/version.cuh")
-  rapids_cmake_write_git_revision_file(
-    nvbench_git_revision
-    "${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
-  )
   rapids_cpm_init()
 endmacro()
diff --git a/cmake/NVBenchUtilities.cmake b/cmake/NVBenchUtilities.cmake
index 36684203..caa79b8b 100644
--- a/cmake/NVBenchUtilities.cmake
+++ b/cmake/NVBenchUtilities.cmake
@@ -1,3 +1,48 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Passes all args directly to execute_process while setting up the following
+# result variables and propagating them to the caller's scope:
+#
+# - nvbench_process_exit_code
+# - nvbench_process_stdout
+# - nvbench_process_stderr
+#
+# If the command is not successful (i.e. the recorded exit code is not zero),
+# a non-fatal warning is printed.
+function(nvbench_execute_non_fatal_process)
+  execute_process(${ARGN}
+    RESULT_VARIABLE nvbench_process_exit_code
+    OUTPUT_VARIABLE nvbench_process_stdout
+    ERROR_VARIABLE nvbench_process_stderr
+  )
+
+  if (NOT nvbench_process_exit_code EQUAL 0) # Warn only; configuration continues.
+    message(WARNING
+      "execute_process failed with non-zero exit code: ${nvbench_process_exit_code}\n"
+      "${ARGN}\n"
+      "stdout:\n${nvbench_process_stdout}\n"
+      "stderr:\n${nvbench_process_stderr}\n"
+    )
+  endif()
+
+  # Re-export the function-local results so the caller can inspect them.
+  set(nvbench_process_exit_code "${nvbench_process_exit_code}" PARENT_SCOPE)
+  set(nvbench_process_stdout "${nvbench_process_stdout}" PARENT_SCOPE)
+  set(nvbench_process_stderr "${nvbench_process_stderr}" PARENT_SCOPE)
+endfunction()
+
 # Writes CMAKE_CUDA_ARCHITECTURES to out_var, but using escaped semicolons
 # as delimiters
 function(nvbench_escaped_cuda_arches out_var)
diff --git a/cmake/PrintCTestRunTimes.cmake b/cmake/PrintCTestRunTimes.cmake
new file mode 100644
index 00000000..f4ac7d90
--- /dev/null
+++ b/cmake/PrintCTestRunTimes.cmake
@@ -0,0 +1,127 @@
+## This CMake script parses the output of ctest and prints a formatted list
+## of individual test runtimes, sorted longest first.
+##
+## ctest > ctest_log
+## cmake -DLOGFILE=ctest_log \
+##       -DMINSEC=10 \
+##       -P PrintCTestRunTimes.cmake
+##
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Left-pad the variable named by `string_var` with "0" to `width` chars (in the caller's scope).
+function(pad_string_with_zeros string_var width)
+  set(padded "${${string_var}}")
+  string(LENGTH "${padded}" len)
+  while(len LESS width)
+    set(padded "0${padded}")
+    math(EXPR len "${len} + 1")
+  endwhile()
+  set(${string_var} "${padded}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
+endif()
+
+if (NOT DEFINED MINSEC)
+  set(MINSEC 10) # Default threshold: faster tests are counted but not listed.
+endif()
+
+set(num_below_thresh 0) # Number of tests that ran for fewer than MINSEC seconds.
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+string(JOIN "" regex
+  "[0-9]+/[0-9]+[ ]+Test[ ]+#"
+  "([0-9]+)"                        # Test ID
+  ":[ ]+"
+  "([^ ]+)"                         # Test Name
+  "[ ]*\\.+[ ]*\\**[ ]*"            # Dot leader, optionally followed by asterisks
+  "([^ ]+)"                         # Result
+  "[ ]+"
+  "([0-9]+)"                        # Seconds (integer part only)
+  "\\.[0-9]+[ ]+sec"                # Fractional seconds, matched but not captured
+)
+
+message(DEBUG "LOGFILE: ${LOGFILE}")
+message(DEBUG "MINSEC: ${MINSEC}")
+message(DEBUG "regex: ${regex}")
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each test result line
+  string(REGEX MATCH "${regex}" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 4) # All four capture groups present => a test result line.
+    set(test_id      "${CMAKE_MATCH_1}")
+    set(test_name    "${CMAKE_MATCH_2}")
+    set(test_result  "${CMAKE_MATCH_3}")
+    set(tmp          "${CMAKE_MATCH_4}") # floor(runtime_seconds)
+
+    if (tmp LESS MINSEC) # Below the threshold: count it and skip the table entry.
+      math(EXPR num_below_thresh "${num_below_thresh} + 1")
+      continue()
+    endif()
+
+    # Compute human readable time
+    math(EXPR days         "${tmp} / (60 * 60 * 24)")
+    math(EXPR tmp          "${tmp} - (${days} * 60 * 60 * 24)")
+    math(EXPR hours        "${tmp} / (60 * 60)")
+    math(EXPR tmp          "${tmp} - (${hours} * 60 * 60)")
+    math(EXPR minutes      "${tmp} / (60)")
+    math(EXPR tmp          "${tmp} - (${minutes} * 60)")
+    math(EXPR seconds      "${tmp}")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+
+    # Construct table entry
+    # Later values in the file for the same test ID overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${test_id}" key)
+    string(JOIN " | " ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s"
+      "${test_result}"
+      "${test_id}: ${test_name}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries) # Also reached when every test ran for fewer than MINSEC seconds.
+  message(STATUS "LOGFILE contained no test times ('${LOGFILE}').")
+endif()
+
+# Sort in descending order (string sort; relies on the zero-padded time fields):
+list(SORT entries ORDER DESCENDING)
+
+# Dump table:
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
+
+if (num_below_thresh GREATER 0)
+  message(STATUS "${num_below_thresh} additional tests took < ${MINSEC}s each.")
+endif()
diff --git a/cmake/PrintNinjaBuildTimes.cmake b/cmake/PrintNinjaBuildTimes.cmake
new file mode 100644
index 00000000..65d243d3
--- /dev/null
+++ b/cmake/PrintNinjaBuildTimes.cmake
@@ -0,0 +1,101 @@
+## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
+## build/link times, sorted longest first.
+##
+## cmake -DLOGFILE=<.ninja_log file> \
+##       -P PrintNinjaBuildTimes.cmake
+##
+## If LOGFILE is omitted, the current directory's .ninja_log file is used.
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Left-pad the variable named by `string_var` with "0" to `width` chars (in the caller's scope).
+function(pad_string_with_zeros string_var width)
+  set(padded "${${string_var}}")
+  string(LENGTH "${padded}" len)
+  while(len LESS width)
+    set(padded "0${padded}")
+    math(EXPR len "${len} + 1")
+  endwhile()
+  set(${string_var} "${padded}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  set(LOGFILE ".ninja_log") # Default documented above: the current directory's log.
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each entry: <start_ms> TAB <end_ms> TAB <number> TAB <command> TAB <hex field>
+  string(REGEX MATCH
+    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)\t[0-9a-fA-F]+$" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 3) # All three capture groups present => a build entry.
+    set(start_ms ${CMAKE_MATCH_1})
+    set(end_ms ${CMAKE_MATCH_2})
+    set(command "${CMAKE_MATCH_3}")
+    math(EXPR runtime_ms "${end_ms} - ${start_ms}")
+
+    # Compute human readable time
+    math(EXPR days         "${runtime_ms} / (1000 * 60 * 60 * 24)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
+    math(EXPR hours        "${runtime_ms} / (1000 * 60 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
+    math(EXPR minutes      "${runtime_ms} / (1000 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${minutes} * 1000 * 60)")
+    math(EXPR seconds      "${runtime_ms} / 1000")
+    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+    pad_string_with_zeros(milliseconds 3)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${command}" key)
+    set(ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
+endif()
+
+# Sort in descending order (string sort; relies on the zero-padded time fields):
+list(SORT entries)
+list(REVERSE entries)
+
+# Dump table:
+message(STATUS "-----------------------+----------------------------")
+message(STATUS "Time                   | Command                    ")
+message(STATUS "-----------------------+----------------------------")
+
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/cmake/header_test.in.cxx b/cmake/header_test.in.cxx
new file mode 100644
index 00000000..c26753e1
--- /dev/null
+++ b/cmake/header_test.in.cxx
@@ -0,0 +1,57 @@
+// This source file checks that:
+// 1) Header <${header_str}> compiles without error.
+// 2) Common macro collisions with platform/system headers are avoided.
+
+// Turn off failures for certain configurations:
+#ifndef NVBench_IGNORE_MACRO_CHECKS
+
+// Define NVBench_MACRO_CHECK(macro, header), which emits a diagnostic indicating
+// a potential macro collision and halts.
+//
+// Hacky way to build a string, but it works on all tested platforms.
+#define NVBench_MACRO_CHECK(MACRO, HEADER)                                      \
+  NVBench_MACRO_CHECK_IMPL(Identifier MACRO should not be used from NVBench      \
+                           headers due to conflicts with HEADER macros.)
+
+// Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
+// don't want to #include any headers other than the one being tested.
+//
+// This is only implemented for MSVC/GCC/Clang.
+#if defined(_MSC_VER) // MSVC
+
+// Fake up an error for MSVC
+#define NVBench_MACRO_CHECK_IMPL(msg)                                           \
+  /* Print message that looks like an error: */                                \
+  __pragma(message(__FILE__ ":" NVBench_MACRO_CHECK_IMPL0(__LINE__)             \
+                   ": error: " #msg))                                          \
+  /* abort compilation due to static_assert or syntax error: */                \
+  static_assert(false, #msg);
+#define NVBench_MACRO_CHECK_IMPL0(x) NVBench_MACRO_CHECK_IMPL1(x)
+#define NVBench_MACRO_CHECK_IMPL1(x) #x
+
+#elif defined(__clang__) || defined(__GNUC__)
+
+// GCC/clang are easy:
+#define NVBench_MACRO_CHECK_IMPL(msg) NVBench_MACRO_CHECK_IMPL0(GCC error #msg)
+#define NVBench_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
+
+#endif // On any other compiler NVBench_MACRO_CHECK_IMPL is left undefined.
+
+// complex.h conflicts
+#define I NVBench_MACRO_CHECK('I', complex.h)
+
+// windows.h conflicts
+#define small NVBench_MACRO_CHECK('small', windows.h)
+// We can't enable these checks without breaking some builds -- some standard
+// library implementations unconditionally `#undef` these macros, which then
+// causes random failures later.
+// Leaving these commented out as a warning: Here be dragons.
+//#define min(...) NVBench_MACRO_CHECK('min', windows.h)
+//#define max(...) NVBench_MACRO_CHECK('max', windows.h)
+
+// termios.h conflicts (NVIDIA/thrust#1547)
+#define B0 NVBench_MACRO_CHECK("B0", termios.h)
+
+#endif // NVBench_IGNORE_MACRO_CHECKS
+
+#include <${header_str}>
diff --git a/cmake/patches/json_unordered_map_ice.cmake b/cmake/patches/json_unordered_map_ice.cmake
new file mode 100644
index 00000000..44f37c3b
--- /dev/null
+++ b/cmake/patches/json_unordered_map_ice.cmake
@@ -0,0 +1,22 @@
+# NVCC 11.1 and GCC 9 need a patch to build, otherwise:
+#
+# nlohmann/ordered_map.hpp(29): error #3316:
+# Internal Compiler Error (codegen): "internal error during structure layout!"
+#
+# Usage:
+# ${CMAKE_COMMAND}
+#   -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
+#   -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
+#   -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
+#   -P "json_unordered_map_ice.cmake"
+
+if(CUDA_VERSION VERSION_GREATER 11.8 OR NOT CXX_ID STREQUAL "GNU" OR CXX_VERSION VERSION_LESS 9.0)
+  return()
+endif()
+
+# Read the file and replace every occurrence of "JSON_NO_UNIQUE_ADDRESS" with
+# "/* [NVBench Patch] JSON_NO_UNIQUE_ADDRESS */".
+file(READ "include/nlohmann/ordered_map.hpp" NLOHMANN_ORDERED_MAP_HPP)
+string(REPLACE "JSON_NO_UNIQUE_ADDRESS" "/* [NVBench Patch] JSON_NO_UNIQUE_ADDRESS */"
+  NLOHMANN_ORDERED_MAP_HPP "${NLOHMANN_ORDERED_MAP_HPP}")
+file(WRITE "include/nlohmann/ordered_map.hpp" "${NLOHMANN_ORDERED_MAP_HPP}")
diff --git a/cmake/patches/nlohmann_json.hpp b/cmake/patches/nlohmann_json.hpp
deleted file mode 100644
index 9a3a0ccb..00000000
--- a/cmake/patches/nlohmann_json.hpp
+++ /dev/null
@@ -1,8799 +0,0 @@
-/*
-    __ _____ _____ _____
- __|  |   __|     |   | |  JSON for Modern C++
-|  |  |__   |  |  | | | |  version 3.9.1
-|_____|_____|_____|_|___|  https://github.com/nlohmann/json
-
-Licensed under the MIT License <http://opensource.org/licenses/MIT>.
-SPDX-License-Identifier: MIT
-Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
-
-Permission is hereby  granted, free of charge, to any  person obtaining a copy
-of this software and associated  documentation files (the "Software"), to deal
-in the Software  without restriction, including without  limitation the rights
-to  use, copy,  modify, merge,  publish, distribute,  sublicense, and/or  sell
-copies  of  the Software,  and  to  permit persons  to  whom  the Software  is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE  IS PROVIDED "AS  IS", WITHOUT WARRANTY  OF ANY KIND,  EXPRESS OR
-IMPLIED,  INCLUDING BUT  NOT  LIMITED TO  THE  WARRANTIES OF  MERCHANTABILITY,
-FITNESS FOR  A PARTICULAR PURPOSE AND  NONINFRINGEMENT. IN NO EVENT  SHALL THE
-AUTHORS  OR COPYRIGHT  HOLDERS  BE  LIABLE FOR  ANY  CLAIM,  DAMAGES OR  OTHER
-LIABILITY, WHETHER IN AN ACTION OF  CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE  OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/
-
-#ifndef INCLUDE_NLOHMANN_JSON_HPP_
-#define INCLUDE_NLOHMANN_JSON_HPP_
-
-#define NLOHMANN_JSON_VERSION_MAJOR 3
-#define NLOHMANN_JSON_VERSION_MINOR 9
-#define NLOHMANN_JSON_VERSION_PATCH 1
-
-#include <algorithm> // all_of, find, for_each
-#include <cstddef> // nullptr_t, ptrdiff_t, size_t
-#include <functional> // hash, less
-#include <initializer_list> // initializer_list
-#include <iosfwd> // istream, ostream
-#include <iterator> // random_access_iterator_tag
-#include <memory> // unique_ptr
-#include <numeric> // accumulate
-#include <string> // string, stoi, to_string
-#include <utility> // declval, forward, move, pair, swap
-#include <vector> // vector
-
-#include <nlohmann/adl_serializer.hpp>
-#include <nlohmann/byte_container_with_subtype.hpp>
-#include <nlohmann/detail/conversions/from_json.hpp>
-#include <nlohmann/detail/conversions/to_json.hpp>
-#include <nlohmann/detail/exceptions.hpp>
-#include <nlohmann/detail/hash.hpp>
-#include <nlohmann/detail/input/binary_reader.hpp>
-#include <nlohmann/detail/input/input_adapters.hpp>
-#include <nlohmann/detail/input/lexer.hpp>
-#include <nlohmann/detail/input/parser.hpp>
-#include <nlohmann/detail/iterators/internal_iterator.hpp>
-#include <nlohmann/detail/iterators/iter_impl.hpp>
-#include <nlohmann/detail/iterators/iteration_proxy.hpp>
-#include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
-#include <nlohmann/detail/iterators/primitive_iterator.hpp>
-#include <nlohmann/detail/json_pointer.hpp>
-#include <nlohmann/detail/json_ref.hpp>
-#include <nlohmann/detail/macro_scope.hpp>
-#include <nlohmann/detail/meta/cpp_future.hpp>
-#include <nlohmann/detail/meta/type_traits.hpp>
-#include <nlohmann/detail/output/binary_writer.hpp>
-#include <nlohmann/detail/output/output_adapters.hpp>
-#include <nlohmann/detail/output/serializer.hpp>
-#include <nlohmann/detail/value_t.hpp>
-#include <nlohmann/json_fwd.hpp>
-#include <nlohmann/ordered_map.hpp>
-
-/*!
-@brief namespace for Niels Lohmann
-@see https://github.com/nlohmann
-@since version 1.0.0
-*/
-namespace nlohmann
-{
-
-/*!
-@brief a class to store JSON values
-
-@tparam ObjectType type for JSON objects (`std::map` by default; will be used
-in @ref object_t)
-@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used
-in @ref array_t)
-@tparam StringType type for JSON strings and object keys (`std::string` by
-default; will be used in @ref string_t)
-@tparam BooleanType type for JSON booleans (`bool` by default; will be used
-in @ref boolean_t)
-@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by
-default; will be used in @ref number_integer_t)
-@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c
-`uint64_t` by default; will be used in @ref number_unsigned_t)
-@tparam NumberFloatType type for JSON floating-point numbers (`double` by
-default; will be used in @ref number_float_t)
-@tparam BinaryType type for packed binary data for compatibility with binary
-serialization formats (`std::vector<std::uint8_t>` by default; will be used in
-@ref binary_t)
-@tparam AllocatorType type of the allocator to use (`std::allocator` by
-default)
-@tparam JSONSerializer the serializer to resolve internal calls to `to_json()`
-and `from_json()` (@ref adl_serializer by default)
-
-@requirement The class satisfies the following concept requirements:
-- Basic
- - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible):
-   JSON values can be default constructed. The result will be a JSON null
-   value.
- - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible):
-   A JSON value can be constructed from an rvalue argument.
- - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible):
-   A JSON value can be copy-constructed from an lvalue expression.
- - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable):
-   A JSON value van be assigned from an rvalue argument.
- - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable):
-   A JSON value can be copy-assigned from an lvalue expression.
- - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
-   JSON values can be destructed.
-- Layout
- - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
-   JSON values have
-   [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
-   All non-static data members are private and standard layout types, the
-   class has no virtual functions or (virtual) base classes.
-- Library-wide
- - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
-   JSON values can be compared with `==`, see @ref
-   operator==(const_reference,const_reference).
- - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
-   JSON values can be compared with `<`, see @ref
-   operator<(const_reference,const_reference).
- - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
-   Any JSON lvalue or rvalue of can be swapped with any lvalue or rvalue of
-   other compatible types, using unqualified function call @ref swap().
- - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
-   JSON values can be compared against `std::nullptr_t` objects which are used
-   to model the `null` value.
-- Container
- - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
-   JSON values can be used like STL containers and provide iterator access.
- - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer);
-   JSON values can be used like STL containers and provide reverse iterator
-   access.
-
-@invariant The member variables @a m_value and @a m_type have the following
-relationship:
-- If `m_type == value_t::object`, then `m_value.object != nullptr`.
-- If `m_type == value_t::array`, then `m_value.array != nullptr`.
-- If `m_type == value_t::string`, then `m_value.string != nullptr`.
-The invariants are checked by member function assert_invariant().
-
-@internal
-@note ObjectType trick from https://stackoverflow.com/a/9860911
-@endinternal
-
-@see [RFC 7159: The JavaScript Object Notation (JSON) Data Interchange
-Format](http://rfc7159.net/rfc7159)
-
-@since version 1.0.0
-
-@nosubgrouping
-*/
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-class basic_json
-{
-  private:
-    template<detail::value_t> friend struct detail::external_constructor;
-    friend ::nlohmann::json_pointer<basic_json>;
-
-    template<typename BasicJsonType, typename InputType>
-    friend class ::nlohmann::detail::parser;
-    friend ::nlohmann::detail::serializer<basic_json>;
-    template<typename BasicJsonType>
-    friend class ::nlohmann::detail::iter_impl;
-    template<typename BasicJsonType, typename CharType>
-    friend class ::nlohmann::detail::binary_writer;
-    template<typename BasicJsonType, typename InputType, typename SAX>
-    friend class ::nlohmann::detail::binary_reader;
-    template<typename BasicJsonType>
-    friend class ::nlohmann::detail::json_sax_dom_parser;
-    template<typename BasicJsonType>
-    friend class ::nlohmann::detail::json_sax_dom_callback_parser;
-
-    /// workaround type for MSVC
-    using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
-
-    // convenience aliases for types residing in namespace detail;
-    using lexer = ::nlohmann::detail::lexer_base<basic_json>;
-
-    template<typename InputAdapterType>
-    static ::nlohmann::detail::parser<basic_json, InputAdapterType> parser(
-        InputAdapterType adapter,
-        detail::parser_callback_t<basic_json>cb = nullptr,
-        const bool allow_exceptions = true,
-        const bool ignore_comments = false
-    )
-    {
-        return ::nlohmann::detail::parser<basic_json, InputAdapterType>(std::move(adapter),
-                std::move(cb), allow_exceptions, ignore_comments);
-    }
-
-    using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t;
-    template<typename BasicJsonType>
-    using internal_iterator = ::nlohmann::detail::internal_iterator<BasicJsonType>;
-    template<typename BasicJsonType>
-    using iter_impl = ::nlohmann::detail::iter_impl<BasicJsonType>;
-    template<typename Iterator>
-    using iteration_proxy = ::nlohmann::detail::iteration_proxy<Iterator>;
-    template<typename Base> using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator<Base>;
-
-    template<typename CharType>
-    using output_adapter_t = ::nlohmann::detail::output_adapter_t<CharType>;
-
-    template<typename InputType>
-    using binary_reader = ::nlohmann::detail::binary_reader<basic_json, InputType>;
-    template<typename CharType> using binary_writer = ::nlohmann::detail::binary_writer<basic_json, CharType>;
-
-    using serializer = ::nlohmann::detail::serializer<basic_json>;
-
-  public:
-    using value_t = detail::value_t;
-    /// JSON Pointer, see @ref nlohmann::json_pointer
-    using json_pointer = ::nlohmann::json_pointer<basic_json>;
-    template<typename T, typename SFINAE>
-    using json_serializer = JSONSerializer<T, SFINAE>;
-    /// how to treat decoding errors
-    using error_handler_t = detail::error_handler_t;
-    /// how to treat CBOR tags
-    using cbor_tag_handler_t = detail::cbor_tag_handler_t;
-    /// helper type for initializer lists of basic_json values
-    using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
-
-    using input_format_t = detail::input_format_t;
-    /// SAX interface type, see @ref nlohmann::json_sax
-    using json_sax_t = json_sax<basic_json>;
-
-    ////////////////
-    // exceptions //
-    ////////////////
-
-    /// @name exceptions
-    /// Classes to implement user-defined exceptions.
-    /// @{
-
-    /// @copydoc detail::exception
-    using exception = detail::exception;
-    /// @copydoc detail::parse_error
-    using parse_error = detail::parse_error;
-    /// @copydoc detail::invalid_iterator
-    using invalid_iterator = detail::invalid_iterator;
-    /// @copydoc detail::type_error
-    using type_error = detail::type_error;
-    /// @copydoc detail::out_of_range
-    using out_of_range = detail::out_of_range;
-    /// @copydoc detail::other_error
-    using other_error = detail::other_error;
-
-    /// @}
-
-
-    /////////////////////
-    // container types //
-    /////////////////////
-
-    /// @name container types
-    /// The canonic container types to use @ref basic_json like any other STL
-    /// container.
-    /// @{
-
-    /// the type of elements in a basic_json container
-    using value_type = basic_json;
-
-    /// the type of an element reference
-    using reference = value_type&;
-    /// the type of an element const reference
-    using const_reference = const value_type&;
-
-    /// a type to represent differences between iterators
-    using difference_type = std::ptrdiff_t;
-    /// a type to represent container sizes
-    using size_type = std::size_t;
-
-    /// the allocator type
-    using allocator_type = AllocatorType<basic_json>;
-
-    /// the type of an element pointer
-    using pointer = typename std::allocator_traits<allocator_type>::pointer;
-    /// the type of an element const pointer
-    using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
-
-    /// an iterator for a basic_json container
-    using iterator = iter_impl<basic_json>;
-    /// a const iterator for a basic_json container
-    using const_iterator = iter_impl<const basic_json>;
-    /// a reverse iterator for a basic_json container
-    using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
-    /// a const reverse iterator for a basic_json container
-    using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;
-
-    /// @}
-
-
-    /*!
-    @brief returns the allocator associated with the container
-    */
-    static allocator_type get_allocator()
-    {
-        return allocator_type();
-    }
-
-    /*!
-    @brief returns version information on the library
-
-    This function returns a JSON object with information about the library,
-    including the version number and information on the platform and compiler.
-
-    @return JSON object holding version information
-    key         | description
-    ----------- | ---------------
-    `compiler`  | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version).
-    `copyright` | The copyright line for the library as string.
-    `name`      | The name of the library as string.
-    `platform`  | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`.
-    `url`       | The URL of the project as string.
-    `version`   | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string).
-
-    @liveexample{The following code shows an example output of the `meta()`
-    function.,meta}
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @complexity Constant.
-
-    @since 2.1.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json meta()
-    {
-        basic_json result;
-
-        result["copyright"] = "(C) 2013-2020 Niels Lohmann";
-        result["name"] = "JSON for Modern C++";
-        result["url"] = "https://github.com/nlohmann/json";
-        result["version"]["string"] =
-            std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." +
-            std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." +
-            std::to_string(NLOHMANN_JSON_VERSION_PATCH);
-        result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
-        result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
-        result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;
-
-#ifdef _WIN32
-        result["platform"] = "win32";
-#elif defined __linux__
-        result["platform"] = "linux";
-#elif defined __APPLE__
-        result["platform"] = "apple";
-#elif defined __unix__
-        result["platform"] = "unix";
-#else
-        result["platform"] = "unknown";
-#endif
-
-#if defined(__ICC) || defined(__INTEL_COMPILER)
-        result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}};
-#elif defined(__clang__)
-        result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}};
-#elif defined(__GNUC__) || defined(__GNUG__)
-        result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." + std::to_string(__GNUC_PATCHLEVEL__)}};
-#elif defined(__HP_cc) || defined(__HP_aCC)
-        result["compiler"] = "hp"
-#elif defined(__IBMCPP__)
-        result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}};
-#elif defined(_MSC_VER)
-        result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}};
-#elif defined(__PGI)
-        result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}};
-#elif defined(__SUNPRO_CC)
-        result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}};
-#else
-        result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
-#endif
-
-#ifdef __cplusplus
-        result["compiler"]["c++"] = std::to_string(__cplusplus);
-#else
-        result["compiler"]["c++"] = "unknown";
-#endif
-        return result;
-    }
-
-
-    ///////////////////////////
-    // JSON value data types //
-    ///////////////////////////
-
-    /// @name JSON value data types
-    /// The data types to store a JSON value. These types are derived from
-    /// the template arguments passed to class @ref basic_json.
-    /// @{
-
-#if defined(JSON_HAS_CPP_14)
-    // Use transparent comparator if possible, combined with perfect forwarding
-    // on find() and count() calls prevents unnecessary string construction.
-    using object_comparator_t = std::less<>;
-#else
-    using object_comparator_t = std::less<StringType>;
-#endif
-
-    /*!
-    @brief a type for an object
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON objects as follows:
-    > An object is an unordered collection of zero or more name/value pairs,
-    > where a name is a string and a value is a string, number, boolean, null,
-    > object, or array.
-
-    To store objects in C++, a type is defined by the template parameters
-    described below.
-
-    @tparam ObjectType  the container to store objects (e.g., `std::map` or
-    `std::unordered_map`)
-    @tparam StringType the type of the keys or names (e.g., `std::string`).
-    The comparison function `std::less<StringType>` is used to order elements
-    inside the container.
-    @tparam AllocatorType the allocator to use for objects (e.g.,
-    `std::allocator`)
-
-    #### Default type
-
-    With the default values for @a ObjectType (`std::map`), @a StringType
-    (`std::string`), and @a AllocatorType (`std::allocator`), the default
-    value for @a object_t is:
-
-    @code {.cpp}
-    std::map<
-      std::string, // key_type
-      basic_json, // value_type
-      std::less<std::string>, // key_compare
-      std::allocator<std::pair<const std::string, basic_json>> // allocator_type
-    >
-    @endcode
-
-    #### Behavior
-
-    The choice of @a object_t influences the behavior of the JSON class. With
-    the default type, objects have the following behavior:
-
-    - When all names are unique, objects will be interoperable in the sense
-      that all software implementations receiving that object will agree on
-      the name-value mappings.
-    - When the names within an object are not unique, it is unspecified which
-      one of the values for a given key will be chosen. For instance,
-      `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or
-      `{"key": 2}`.
-    - Internally, name/value pairs are stored in lexicographical order of the
-      names. Objects will also be serialized (see @ref dump) in this order.
-      For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored
-      and serialized as `{"a": 2, "b": 1}`.
-    - When comparing objects, the order of the name/value pairs is irrelevant.
-      This makes objects interoperable in the sense that they will not be
-      affected by these differences. For instance, `{"b": 1, "a": 2}` and
-      `{"a": 2, "b": 1}` will be treated as equal.
-
-    #### Limits
-
-    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
-    > An implementation may set limits on the maximum depth of nesting.
-
-    In this class, the object's limit of nesting is not explicitly constrained.
-    However, a maximum depth of nesting may be introduced by the compiler or
-    runtime environment. A theoretical limit can be queried by calling the
-    @ref max_size function of a JSON object.
-
-    #### Storage
-
-    Objects are stored as pointers in a @ref basic_json type. That is, for any
-    access to object values, a pointer of type `object_t*` must be
-    dereferenced.
-
-    @sa @ref array_t -- type for an array value
-
-    @since version 1.0.0
-
-    @note The order name/value pairs are added to the object is *not*
-    preserved by the library. Therefore, iterating an object may return
-    name/value pairs in a different order than they were originally stored. In
-    fact, keys will be traversed in alphabetical order as `std::map` with
-    `std::less` is used by default. Please note this behavior conforms to [RFC
-    7159](http://rfc7159.net/rfc7159), because any order implements the
-    specified "unordered" nature of JSON objects.
-    */
-    using object_t = ObjectType<StringType,
-          basic_json,
-          object_comparator_t,
-          AllocatorType<std::pair<const StringType,
-          basic_json>>>;
-
-    /*!
-    @brief a type for an array
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON arrays as follows:
-    > An array is an ordered sequence of zero or more values.
-
-    To store objects in C++, a type is defined by the template parameters
-    explained below.
-
-    @tparam ArrayType  container type to store arrays (e.g., `std::vector` or
-    `std::list`)
-    @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)
-
-    #### Default type
-
-    With the default values for @a ArrayType (`std::vector`) and @a
-    AllocatorType (`std::allocator`), the default value for @a array_t is:
-
-    @code {.cpp}
-    std::vector<
-      basic_json, // value_type
-      std::allocator<basic_json> // allocator_type
-    >
-    @endcode
-
-    #### Limits
-
-    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
-    > An implementation may set limits on the maximum depth of nesting.
-
-    In this class, the array's limit of nesting is not explicitly constrained.
-    However, a maximum depth of nesting may be introduced by the compiler or
-    runtime environment. A theoretical limit can be queried by calling the
-    @ref max_size function of a JSON array.
-
-    #### Storage
-
-    Arrays are stored as pointers in a @ref basic_json type. That is, for any
-    access to array values, a pointer of type `array_t*` must be dereferenced.
-
-    @sa @ref object_t -- type for an object value
-
-    @since version 1.0.0
-    */
-    using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;
-
-    /*!
-    @brief a type for a string
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes JSON strings as follows:
-    > A string is a sequence of zero or more Unicode characters.
-
-    To store objects in C++, a type is defined by the template parameter
-    described below. Unicode values are split by the JSON class into
-    byte-sized characters during deserialization.
-
-    @tparam StringType  the container to store strings (e.g., `std::string`).
-    Note this container is used for keys/names in objects, see @ref object_t.
-
-    #### Default type
-
-    With the default values for @a StringType (`std::string`), the default
-    value for @a string_t is:
-
-    @code {.cpp}
-    std::string
-    @endcode
-
-    #### Encoding
-
-    Strings are stored in UTF-8 encoding. Therefore, functions like
-    `std::string::size()` or `std::string::length()` return the number of
-    bytes in the string rather than the number of characters or glyphs.
-
-    #### String comparison
-
-    [RFC 7159](http://rfc7159.net/rfc7159) states:
-    > Software implementations are typically required to test names of object
-    > members for equality. Implementations that transform the textual
-    > representation into sequences of Unicode code units and then perform the
-    > comparison numerically, code unit by code unit, are interoperable in the
-    > sense that implementations will agree in all cases on equality or
-    > inequality of two strings. For example, implementations that compare
-    > strings with escaped characters unconverted may incorrectly find that
-    > `"a\\b"` and `"a\u005Cb"` are not equal.
-
-    This implementation is interoperable as it does compare strings code unit
-    by code unit.
-
-    #### Storage
-
-    String values are stored as pointers in a @ref basic_json type. That is,
-    for any access to string values, a pointer of type `string_t*` must be
-    dereferenced.
-
-    @since version 1.0.0
-    */
-    using string_t = StringType;
-
-    /*!
-    @brief a type for a boolean
-
-    [RFC 7159](http://rfc7159.net/rfc7159) implicitly describes a boolean as a
-    type which differentiates the two literals `true` and `false`.
-
-    To store objects in C++, a type is defined by the template parameter @a
-    BooleanType which chooses the type to use.
-
-    #### Default type
-
-    With the default values for @a BooleanType (`bool`), the default value for
-    @a boolean_t is:
-
-    @code {.cpp}
-    bool
-    @endcode
-
-    #### Storage
-
-    Boolean values are stored directly inside a @ref basic_json type.
-
-    @since version 1.0.0
-    */
-    using boolean_t = BooleanType;
-
-    /*!
-    @brief a type for a number (integer)
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
-    > The representation of numbers is similar to that used in most
-    > programming languages. A number is represented in base 10 using decimal
-    > digits. It contains an integer component that may be prefixed with an
-    > optional minus sign, which may be followed by a fraction part and/or an
-    > exponent part. Leading zeros are not allowed. (...) Numeric values that
-    > cannot be represented in the grammar below (such as Infinity and NaN)
-    > are not permitted.
-
-    This description includes both integer and floating-point numbers.
-    However, C++ allows more precise storage if it is known whether the number
-    is a signed integer, an unsigned integer or a floating-point number.
-    Therefore, three different types, @ref number_integer_t, @ref
-    number_unsigned_t and @ref number_float_t are used.
-
-    To store integer numbers in C++, a type is defined by the template
-    parameter @a NumberIntegerType which chooses the type to use.
-
-    #### Default type
-
-    With the default values for @a NumberIntegerType (`int64_t`), the default
-    value for @a number_integer_t is:
-
-    @code {.cpp}
-    int64_t
-    @endcode
-
-    #### Default behavior
-
-    - The restrictions about leading zeros is not enforced in C++. Instead,
-      leading zeros in integer literals lead to an interpretation as octal
-      number. Internally, the value will be stored as decimal number. For
-      instance, the C++ integer literal `010` will be serialized to `8`.
-      During deserialization, leading zeros yield an error.
-    - Not-a-number (NaN) values will be serialized to `null`.
-
-    #### Limits
-
-    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
-    > An implementation may set limits on the range and precision of numbers.
-
-    When the default type is used, the maximal integer number that can be
-    stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
-    that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
-    that are out of range will yield over/underflow when used in a
-    constructor. During deserialization, too large or small integer numbers
-    will be automatically be stored as @ref number_unsigned_t or @ref
-    number_float_t.
-
-    [RFC 7159](http://rfc7159.net/rfc7159) further states:
-    > Note that when such software is used, numbers that are integers and are
-    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
-    > that implementations will agree exactly on their numeric values.
-
-    As this range is a subrange of the exactly supported range [INT64_MIN,
-    INT64_MAX], this class's integer type is interoperable.
-
-    #### Storage
-
-    Integer number values are stored directly inside a @ref basic_json type.
-
-    @sa @ref number_float_t -- type for number values (floating-point)
-
-    @sa @ref number_unsigned_t -- type for number values (unsigned integer)
-
-    @since version 1.0.0
-    */
-    using number_integer_t = NumberIntegerType;
-
-    /*!
-    @brief a type for a number (unsigned)
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
-    > The representation of numbers is similar to that used in most
-    > programming languages. A number is represented in base 10 using decimal
-    > digits. It contains an integer component that may be prefixed with an
-    > optional minus sign, which may be followed by a fraction part and/or an
-    > exponent part. Leading zeros are not allowed. (...) Numeric values that
-    > cannot be represented in the grammar below (such as Infinity and NaN)
-    > are not permitted.
-
-    This description includes both integer and floating-point numbers.
-    However, C++ allows more precise storage if it is known whether the number
-    is a signed integer, an unsigned integer or a floating-point number.
-    Therefore, three different types, @ref number_integer_t, @ref
-    number_unsigned_t and @ref number_float_t are used.
-
-    To store unsigned integer numbers in C++, a type is defined by the
-    template parameter @a NumberUnsignedType which chooses the type to use.
-
-    #### Default type
-
-    With the default values for @a NumberUnsignedType (`uint64_t`), the
-    default value for @a number_unsigned_t is:
-
-    @code {.cpp}
-    uint64_t
-    @endcode
-
-    #### Default behavior
-
-    - The restrictions about leading zeros is not enforced in C++. Instead,
-      leading zeros in integer literals lead to an interpretation as octal
-      number. Internally, the value will be stored as decimal number. For
-      instance, the C++ integer literal `010` will be serialized to `8`.
-      During deserialization, leading zeros yield an error.
-    - Not-a-number (NaN) values will be serialized to `null`.
-
-    #### Limits
-
-    [RFC 7159](http://rfc7159.net/rfc7159) specifies:
-    > An implementation may set limits on the range and precision of numbers.
-
-    When the default type is used, the maximal integer number that can be
-    stored is `18446744073709551615` (UINT64_MAX) and the minimal integer
-    number that can be stored is `0`. Integer numbers that are out of range
-    will yield over/underflow when used in a constructor. During
-    deserialization, too large or small integer numbers will be automatically
-    be stored as @ref number_integer_t or @ref number_float_t.
-
-    [RFC 7159](http://rfc7159.net/rfc7159) further states:
-    > Note that when such software is used, numbers that are integers and are
-    > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
-    > that implementations will agree exactly on their numeric values.
-
-    As this range is a subrange (when considered in conjunction with the
-    number_integer_t type) of the exactly supported range [0, UINT64_MAX],
-    this class's integer type is interoperable.
-
-    #### Storage
-
-    Integer number values are stored directly inside a @ref basic_json type.
-
-    @sa @ref number_float_t -- type for number values (floating-point)
-    @sa @ref number_integer_t -- type for number values (integer)
-
-    @since version 2.0.0
-    */
-    using number_unsigned_t = NumberUnsignedType;
-
-    /*!
-    @brief a type for a number (floating-point)
-
-    [RFC 7159](http://rfc7159.net/rfc7159) describes numbers as follows:
-    > The representation of numbers is similar to that used in most
-    > programming languages. A number is represented in base 10 using decimal
-    > digits. It contains an integer component that may be prefixed with an
-    > optional minus sign, which may be followed by a fraction part and/or an
-    > exponent part. Leading zeros are not allowed. (...) Numeric values that
-    > cannot be represented in the grammar below (such as Infinity and NaN)
-    > are not permitted.
-
-    This description includes both integer and floating-point numbers.
-    However, C++ allows more precise storage if it is known whether the number
-    is a signed integer, an unsigned integer or a floating-point number.
-    Therefore, three different types, @ref number_integer_t, @ref
-    number_unsigned_t and @ref number_float_t are used.
-
-    To store floating-point numbers in C++, a type is defined by the template
-    parameter @a NumberFloatType which chooses the type to use.
-
-    #### Default type
-
-    With the default values for @a NumberFloatType (`double`), the default
-    value for @a number_float_t is:
-
-    @code {.cpp}
-    double
-    @endcode
-
-    #### Default behavior
-
-    - The restrictions about leading zeros is not enforced in C++. Instead,
-      leading zeros in floating-point literals will be ignored. Internally,
-      the value will be stored as decimal number. For instance, the C++
-      floating-point literal `01.2` will be serialized to `1.2`. During
-      deserialization, leading zeros yield an error.
-    - Not-a-number (NaN) values will be serialized to `null`.
-
-    #### Limits
-
-    [RFC 7159](http://rfc7159.net/rfc7159) states:
-    > This specification allows implementations to set limits on the range and
-    > precision of numbers accepted. Since software that implements IEEE
-    > 754-2008 binary64 (double precision) numbers is generally available and
-    > widely used, good interoperability can be achieved by implementations
-    > that expect no more precision or range than these provide, in the sense
-    > that implementations will approximate JSON numbers within the expected
-    > precision.
-
-    This implementation does exactly follow this approach, as it uses double
-    precision floating-point numbers. Note values smaller than
-    `-1.79769313486232e+308` and values greater than `1.79769313486232e+308`
-    will be stored as NaN internally and be serialized to `null`.
-
-    #### Storage
-
-    Floating-point number values are stored directly inside a @ref basic_json
-    type.
-
-    @sa @ref number_integer_t -- type for number values (integer)
-
-    @sa @ref number_unsigned_t -- type for number values (unsigned integer)
-
-    @since version 1.0.0
-    */
-    using number_float_t = NumberFloatType;
-
-    /*!
-    @brief a type for a packed binary type
-
-    This type is a type designed to carry binary data that appears in various
-    serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and
-    BSON's generic binary subtype. This type is NOT a part of standard JSON and
-    exists solely for compatibility with these binary types. As such, it is
-    simply defined as an ordered sequence of zero or more byte values.
-
-    Additionally, as an implementation detail, the subtype of the binary data is
-    carried around as a `std::uint8_t`, which is compatible with both of the
-    binary data formats that use binary subtyping, (though the specific
-    numbering is incompatible with each other, and it is up to the user to
-    translate between them).
-
-    [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type
-    as:
-    > Major type 2: a byte string. The string's length in bytes is represented
-    > following the rules for positive integers (major type 0).
-
-    [MessagePack's documentation on the bin type
-    family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family)
-    describes this type as:
-    > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes
-    > in addition to the size of the byte array.
-
-    [BSON's specifications](http://bsonspec.org/spec.html) describe several
-    binary types; however, this type is intended to represent the generic binary
-    type which has the description:
-    > Generic binary subtype - This is the most commonly used binary subtype and
-    > should be the 'default' for drivers and tools.
-
-    None of these impose any limitations on the internal representation other
-    than the basic unit of storage be some type of array whose parts are
-    decomposable into bytes.
-
-    The default representation of this binary format is a
-    `std::vector<std::uint8_t>`, which is a very common way to represent a byte
-    array in modern C++.
-
-    #### Default type
-
-    The default values for @a BinaryType is `std::vector<std::uint8_t>`
-
-    #### Storage
-
-    Binary Arrays are stored as pointers in a @ref basic_json type. That is,
-    for any access to array values, a pointer of the type `binary_t*` must be
-    dereferenced.
-
-    #### Notes on subtypes
-
-    - CBOR
-       - Binary values are represented as byte strings. No subtypes are
-         supported and will be ignored when CBOR is written.
-    - MessagePack
-       - If a subtype is given and the binary array contains exactly 1, 2, 4, 8,
-         or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8)
-         is used. For other sizes, the ext family (ext8, ext16, ext32) is used.
-         The subtype is then added as singed 8-bit integer.
-       - If no subtype is given, the bin family (bin8, bin16, bin32) is used.
-    - BSON
-       - If a subtype is given, it is used and added as unsigned 8-bit integer.
-       - If no subtype is given, the generic binary subtype 0x00 is used.
-
-    @sa @ref binary -- create a binary array
-
-    @since version 3.8.0
-    */
-    using binary_t = nlohmann::byte_container_with_subtype<BinaryType>;
-    /// @}
-
-  private:
-
-    /// helper for exception-safe object creation
-    template<typename T, typename... Args>
-    JSON_HEDLEY_RETURNS_NON_NULL
-    static T* create(Args&& ... args)
-    {
-        AllocatorType<T> alloc;
-        using AllocatorTraits = std::allocator_traits<AllocatorType<T>>;
-
-        auto deleter = [&](T * object)
-        {
-            AllocatorTraits::deallocate(alloc, object, 1);
-        };
-        std::unique_ptr<T, decltype(deleter)> object(AllocatorTraits::allocate(alloc, 1), deleter);
-        AllocatorTraits::construct(alloc, object.get(), std::forward<Args>(args)...);
-        JSON_ASSERT(object != nullptr);
-        return object.release();
-    }
-
-    ////////////////////////
-    // JSON value storage //
-    ////////////////////////
-
-    /*!
-    @brief a JSON value
-
-    The actual storage for a JSON value of the @ref basic_json class. This
-    union combines the different storage types for the JSON value types
-    defined in @ref value_t.
-
-    JSON type | value_t type    | used type
-    --------- | --------------- | ------------------------
-    object    | object          | pointer to @ref object_t
-    array     | array           | pointer to @ref array_t
-    string    | string          | pointer to @ref string_t
-    boolean   | boolean         | @ref boolean_t
-    number    | number_integer  | @ref number_integer_t
-    number    | number_unsigned | @ref number_unsigned_t
-    number    | number_float    | @ref number_float_t
-    binary    | binary          | pointer to @ref binary_t
-    null      | null            | *no value is stored*
-
-    @note Variable-length types (objects, arrays, and strings) are stored as
-    pointers. The size of the union should not exceed 64 bits if the default
-    value types are used.
-
-    @since version 1.0.0
-    */
-    union json_value
-    {
-        /// object (stored with pointer to save storage)
-        object_t* object;
-        /// array (stored with pointer to save storage)
-        array_t* array;
-        /// string (stored with pointer to save storage)
-        string_t* string;
-        /// binary (stored with pointer to save storage)
-        binary_t* binary;
-        /// boolean
-        boolean_t boolean;
-        /// number (integer)
-        number_integer_t number_integer;
-        /// number (unsigned integer)
-        number_unsigned_t number_unsigned;
-        /// number (floating-point)
-        number_float_t number_float;
-
-        /// default constructor (for null values)
-        json_value() = default;
-        /// constructor for booleans
-        json_value(boolean_t v) noexcept : boolean(v) {}
-        /// constructor for numbers (integer)
-        json_value(number_integer_t v) noexcept : number_integer(v) {}
-        /// constructor for numbers (unsigned)
-        json_value(number_unsigned_t v) noexcept : number_unsigned(v) {}
-        /// constructor for numbers (floating-point)
-        json_value(number_float_t v) noexcept : number_float(v) {}
-        /// constructor for empty values of a given type
-        json_value(value_t t)
-        {
-            switch (t)
-            {
-                case value_t::object:
-                {
-                    object = create<object_t>();
-                    break;
-                }
-
-                case value_t::array:
-                {
-                    array = create<array_t>();
-                    break;
-                }
-
-                case value_t::string:
-                {
-                    string = create<string_t>("");
-                    break;
-                }
-
-                case value_t::binary:
-                {
-                    binary = create<binary_t>();
-                    break;
-                }
-
-                case value_t::boolean:
-                {
-                    boolean = boolean_t(false);
-                    break;
-                }
-
-                case value_t::number_integer:
-                {
-                    number_integer = number_integer_t(0);
-                    break;
-                }
-
-                case value_t::number_unsigned:
-                {
-                    number_unsigned = number_unsigned_t(0);
-                    break;
-                }
-
-                case value_t::number_float:
-                {
-                    number_float = number_float_t(0.0);
-                    break;
-                }
-
-                case value_t::null:
-                {
-                    object = nullptr;  // silence warning, see #821
-                    break;
-                }
-
-                default:
-                {
-                    object = nullptr;  // silence warning, see #821
-                    if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
-                    {
-                        JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.9.1")); // LCOV_EXCL_LINE
-                    }
-                    break;
-                }
-            }
-        }
-
-        /// constructor for strings
-        json_value(const string_t& value)
-        {
-            string = create<string_t>(value);
-        }
-
-        /// constructor for rvalue strings
-        json_value(string_t&& value)
-        {
-            string = create<string_t>(std::move(value));
-        }
-
-        /// constructor for objects
-        json_value(const object_t& value)
-        {
-            object = create<object_t>(value);
-        }
-
-        /// constructor for rvalue objects
-        json_value(object_t&& value)
-        {
-            object = create<object_t>(std::move(value));
-        }
-
-        /// constructor for arrays
-        json_value(const array_t& value)
-        {
-            array = create<array_t>(value);
-        }
-
-        /// constructor for rvalue arrays
-        json_value(array_t&& value)
-        {
-            array = create<array_t>(std::move(value));
-        }
-
-        /// constructor for binary arrays
-        json_value(const typename binary_t::container_type& value)
-        {
-            binary = create<binary_t>(value);
-        }
-
-        /// constructor for rvalue binary arrays
-        json_value(typename binary_t::container_type&& value)
-        {
-            binary = create<binary_t>(std::move(value));
-        }
-
-        /// constructor for binary arrays (internal type)
-        json_value(const binary_t& value)
-        {
-            binary = create<binary_t>(value);
-        }
-
-        /// constructor for rvalue binary arrays (internal type)
-        json_value(binary_t&& value)
-        {
-            binary = create<binary_t>(std::move(value));
-        }
-
-        void destroy(value_t t) noexcept
-        {
-            // flatten the current json_value to a heap-allocated stack
-            std::vector<basic_json> stack;
-
-            // move the top-level items to stack
-            if (t == value_t::array)
-            {
-                stack.reserve(array->size());
-                std::move(array->begin(), array->end(), std::back_inserter(stack));
-            }
-            else if (t == value_t::object)
-            {
-                stack.reserve(object->size());
-                for (auto&& it : *object)
-                {
-                    stack.push_back(std::move(it.second));
-                }
-            }
-
-            while (!stack.empty())
-            {
-                // move the last item to local variable to be processed
-                basic_json current_item(std::move(stack.back()));
-                stack.pop_back();
-
-                // if current_item is array/object, move
-                // its children to the stack to be processed later
-                if (current_item.is_array())
-                {
-                    std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(),
-                              std::back_inserter(stack));
-
-                    current_item.m_value.array->clear();
-                }
-                else if (current_item.is_object())
-                {
-                    for (auto&& it : *current_item.m_value.object)
-                    {
-                        stack.push_back(std::move(it.second));
-                    }
-
-                    current_item.m_value.object->clear();
-                }
-
-                // it's now safe that current_item get destructed
-                // since it doesn't have any children
-            }
-
-            switch (t)
-            {
-                case value_t::object:
-                {
-                    AllocatorType<object_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, object);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, object, 1);
-                    break;
-                }
-
-                case value_t::array:
-                {
-                    AllocatorType<array_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, array);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, array, 1);
-                    break;
-                }
-
-                case value_t::string:
-                {
-                    AllocatorType<string_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, string);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, string, 1);
-                    break;
-                }
-
-                case value_t::binary:
-                {
-                    AllocatorType<binary_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, binary);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, binary, 1);
-                    break;
-                }
-
-                default:
-                {
-                    break;
-                }
-            }
-        }
-    };
-
-    /*!
-    @brief checks the class invariants
-
-    This function asserts the class invariants. It needs to be called at the
-    end of every constructor to make sure that created objects respect the
-    invariant. Furthermore, it has to be called each time the type of a JSON
-    value is changed, because the invariant expresses a relationship between
-    @a m_type and @a m_value.
-    */
-    void assert_invariant() const noexcept
-    {
-        JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr);
-        JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr);
-        JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr);
-        JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr);
-    }
-
-  public:
-    //////////////////////////
-    // JSON parser callback //
-    //////////////////////////
-
-    /*!
-    @brief parser event types
-
-    The parser callback distinguishes the following events:
-    - `object_start`: the parser read `{` and started to process a JSON object
-    - `key`: the parser read a key of a value in an object
-    - `object_end`: the parser read `}` and finished processing a JSON object
-    - `array_start`: the parser read `[` and started to process a JSON array
-    - `array_end`: the parser read `]` and finished processing a JSON array
-    - `value`: the parser finished reading a JSON value
-
-    @image html callback_events.png "Example when certain parse events are triggered"
-
-    @sa @ref parser_callback_t for more information and examples
-    */
-    using parse_event_t = detail::parse_event_t;
-
-    /*!
-    @brief per-element parser callback type
-
-    With a parser callback function, the result of parsing a JSON text can be
-    influenced. When passed to @ref parse, it is called on certain events
-    (passed as @ref parse_event_t via parameter @a event) with a set recursion
-    depth @a depth and context JSON value @a parsed. The return value of the
-    callback function is a boolean indicating whether the element that emitted
-    the callback shall be kept or not.
-
-    We distinguish six scenarios (determined by the event type) in which the
-    callback function can be called. The following table describes the values
-    of the parameters @a depth, @a event, and @a parsed.
-
-    parameter @a event | description | parameter @a depth | parameter @a parsed
-    ------------------ | ----------- | ------------------ | -------------------
-    parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
-    parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
-    parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
-    parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
-    parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
-    parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
-
-    @image html callback_events.png "Example when certain parse events are triggered"
-
-    Discarding a value (i.e., returning `false`) has different effects
-    depending on the context in which function was called:
-
-    - Discarded values in structured types are skipped. That is, the parser
-      will behave as if the discarded value was never read.
-    - In case a value outside a structured type is skipped, it is replaced
-      with `null`. This case happens if the top-level element is skipped.
-
-    @param[in] depth  the depth of the recursion during parsing
-
-    @param[in] event  an event of type parse_event_t indicating the context in
-    the callback function has been called
-
-    @param[in,out] parsed  the current intermediate parse result; note that
-    writing to this value has no effect for parse_event_t::key events
-
-    @return Whether the JSON value which called the function during parsing
-    should be kept (`true`) or not (`false`). In the latter case, it is either
-    skipped completely or replaced by an empty discarded object.
-
-    @sa @ref parse for examples
-
-    @since version 1.0.0
-    */
-    using parser_callback_t = detail::parser_callback_t<basic_json>;
-
-    //////////////////
-    // constructors //
-    //////////////////
-
-    /// @name constructors and destructors
-    /// Constructors of class @ref basic_json, copy/move constructor, copy
-    /// assignment, static functions creating objects, and the destructor.
-    /// @{
-
-    /*!
-    @brief create an empty value with a given type
-
-    Create an empty JSON value with a given type. The value will be default
-    initialized with an empty value which depends on the type:
-
-    Value type  | initial value
-    ----------- | -------------
-    null        | `null`
-    boolean     | `false`
-    string      | `""`
-    number      | `0`
-    object      | `{}`
-    array       | `[]`
-    binary      | empty array
-
-    @param[in] v  the type of the value to create
-
-    @complexity Constant.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The following code shows the constructor for different @ref
-    value_t values,basic_json__value_t}
-
-    @sa @ref clear() -- restores the postcondition of this constructor
-
-    @since version 1.0.0
-    */
-    basic_json(const value_t v)
-        : m_type(v), m_value(v)
-    {
-        assert_invariant();
-    }
-
-    /*!
-    @brief create a null object
-
-    Create a `null` JSON value. It either takes a null pointer as parameter
-    (explicitly creating `null`) or no parameter (implicitly creating `null`).
-    The passed null pointer itself is not read -- it is only used to choose
-    the right constructor.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this constructor never throws
-    exceptions.
-
-    @liveexample{The following code shows the constructor with and without a
-    null pointer parameter.,basic_json__nullptr_t}
-
-    @since version 1.0.0
-    */
-    basic_json(std::nullptr_t = nullptr) noexcept
-        : basic_json(value_t::null)
-    {
-        assert_invariant();
-    }
-
-    /*!
-    @brief create a JSON value
-
-    This is a "catch all" constructor for all compatible JSON types; that is,
-    types for which a `to_json()` method exists. The constructor forwards the
-    parameter @a val to that method (to `json_serializer<U>::to_json` method
-    with `U = uncvref_t<CompatibleType>`, to be exact).
-
-    Template type @a CompatibleType includes, but is not limited to, the
-    following types:
-    - **arrays**: @ref array_t and all kinds of compatible containers such as
-      `std::vector`, `std::deque`, `std::list`, `std::forward_list`,
-      `std::array`, `std::valarray`, `std::set`, `std::unordered_set`,
-      `std::multiset`, and `std::unordered_multiset` with a `value_type` from
-      which a @ref basic_json value can be constructed.
-    - **objects**: @ref object_t and all kinds of compatible associative
-      containers such as `std::map`, `std::unordered_map`, `std::multimap`,
-      and `std::unordered_multimap` with a `key_type` compatible to
-      @ref string_t and a `value_type` from which a @ref basic_json value can
-      be constructed.
-    - **strings**: @ref string_t, string literals, and all compatible string
-      containers can be used.
-    - **numbers**: @ref number_integer_t, @ref number_unsigned_t,
-      @ref number_float_t, and all convertible number types such as `int`,
-      `size_t`, `int64_t`, `float` or `double` can be used.
-    - **boolean**: @ref boolean_t / `bool` can be used.
-    - **binary**: @ref binary_t / `std::vector<uint8_t>` may be used,
-      unfortunately because string literals cannot be distinguished from binary
-      character arrays by the C++ type system, all types compatible with `const
-      char*` will be directed to the string constructor instead.  This is both
-      for backwards compatibility, and due to the fact that a binary type is not
-      a standard JSON type.
-
-    See the examples below.
-
-    @tparam CompatibleType a type such that:
-    - @a CompatibleType is not derived from `std::istream`,
-    - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move
-         constructors),
-    - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments)
-    - @a CompatibleType is not a @ref basic_json nested type (e.g.,
-         @ref json_pointer, @ref iterator, etc ...)
-    - @ref @ref json_serializer<U> has a
-         `to_json(basic_json_t&, CompatibleType&&)` method
-
-    @tparam U = `uncvref_t<CompatibleType>`
-
-    @param[in] val the value to be forwarded to the respective constructor
-
-    @complexity Usually linear in the size of the passed @a val, also
-                depending on the implementation of the called `to_json()`
-                method.
-
-    @exceptionsafety Depends on the called constructor. For types directly
-    supported by the library (i.e., all types for which no `to_json()` function
-    was provided), strong guarantee holds: if an exception is thrown, there are
-    no changes to any JSON value.
-
-    @liveexample{The following code shows the constructor with several
-    compatible types.,basic_json__CompatibleType}
-
-    @since version 2.1.0
-    */
-    template < typename CompatibleType,
-               typename U = detail::uncvref_t<CompatibleType>,
-               detail::enable_if_t <
-                   !detail::is_basic_json<U>::value && detail::is_compatible_type<basic_json_t, U>::value, int > = 0 >
-    basic_json(CompatibleType && val) noexcept(noexcept(
-                JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
-                                           std::forward<CompatibleType>(val))))
-    {
-        JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
-        assert_invariant();
-    }
-
-    /*!
-    @brief create a JSON value from an existing one
-
-    This is a constructor for existing @ref basic_json types.
-    It does not hijack copy/move constructors, since the parameter has different
-    template arguments than the current ones.
-
-    The constructor tries to convert the internal @ref m_value of the parameter.
-
-    @tparam BasicJsonType a type such that:
-    - @a BasicJsonType is a @ref basic_json type.
-    - @a BasicJsonType has different template arguments than @ref basic_json_t.
-
-    @param[in] val the @ref basic_json value to be converted.
-
-    @complexity Usually linear in the size of the passed @a val, also
-                depending on the implementation of the called `to_json()`
-                method.
-
-    @exceptionsafety Depends on the called constructor. For types directly
-    supported by the library (i.e., all types for which no `to_json()` function
-    was provided), strong guarantee holds: if an exception is thrown, there are
-    no changes to any JSON value.
-
-    @since version 3.2.0
-    */
-    template < typename BasicJsonType,
-               detail::enable_if_t <
-                   detail::is_basic_json<BasicJsonType>::value&& !std::is_same<basic_json, BasicJsonType>::value, int > = 0 >
-    basic_json(const BasicJsonType& val)
-    {
-        using other_boolean_t = typename BasicJsonType::boolean_t;
-        using other_number_float_t = typename BasicJsonType::number_float_t;
-        using other_number_integer_t = typename BasicJsonType::number_integer_t;
-        using other_number_unsigned_t = typename BasicJsonType::number_unsigned_t;
-        using other_string_t = typename BasicJsonType::string_t;
-        using other_object_t = typename BasicJsonType::object_t;
-        using other_array_t = typename BasicJsonType::array_t;
-        using other_binary_t = typename BasicJsonType::binary_t;
-
-        switch (val.type())
-        {
-            case value_t::boolean:
-                JSONSerializer<other_boolean_t>::to_json(*this, val.template get<other_boolean_t>());
-                break;
-            case value_t::number_float:
-                JSONSerializer<other_number_float_t>::to_json(*this, val.template get<other_number_float_t>());
-                break;
-            case value_t::number_integer:
-                JSONSerializer<other_number_integer_t>::to_json(*this, val.template get<other_number_integer_t>());
-                break;
-            case value_t::number_unsigned:
-                JSONSerializer<other_number_unsigned_t>::to_json(*this, val.template get<other_number_unsigned_t>());
-                break;
-            case value_t::string:
-                JSONSerializer<other_string_t>::to_json(*this, val.template get_ref<const other_string_t&>());
-                break;
-            case value_t::object:
-                JSONSerializer<other_object_t>::to_json(*this, val.template get_ref<const other_object_t&>());
-                break;
-            case value_t::array:
-                JSONSerializer<other_array_t>::to_json(*this, val.template get_ref<const other_array_t&>());
-                break;
-            case value_t::binary:
-                JSONSerializer<other_binary_t>::to_json(*this, val.template get_ref<const other_binary_t&>());
-                break;
-            case value_t::null:
-                *this = nullptr;
-                break;
-            case value_t::discarded:
-                m_type = value_t::discarded;
-                break;
-            default:            // LCOV_EXCL_LINE
-                JSON_ASSERT(false);  // LCOV_EXCL_LINE
-        }
-        assert_invariant();
-    }
-
-    /*!
-    @brief create a container (array or object) from an initializer list
-
-    Creates a JSON value of type array or object from the passed initializer
-    list @a init. In case @a type_deduction is `true` (default), the type of
-    the JSON value to be created is deducted from the initializer list @a init
-    according to the following rules:
-
-    1. If the list is empty, an empty JSON object value `{}` is created.
-    2. If the list consists of pairs whose first element is a string, a JSON
-       object value is created where the first elements of the pairs are
-       treated as keys and the second elements are as values.
-    3. In all other cases, an array is created.
-
-    The rules aim to create the best fit between a C++ initializer list and
-    JSON values. The rationale is as follows:
-
-    1. The empty initializer list is written as `{}` which is exactly an empty
-       JSON object.
-    2. C++ has no way of describing mapped types other than to list a list of
-       pairs. As JSON requires that keys must be of type string, rule 2 is the
-       weakest constraint one can pose on initializer lists to interpret them
-       as an object.
-    3. In all other cases, the initializer list could not be interpreted as
-       JSON object type, so interpreting it as JSON array type is safe.
-
-    With the rules described above, the following JSON values cannot be
-    expressed by an initializer list:
-
-    - the empty array (`[]`): use @ref array(initializer_list_t)
-      with an empty initializer list in this case
-    - arrays whose elements satisfy rule 2: use @ref
-      array(initializer_list_t) with the same initializer list
-      in this case
-
-    @note When used without parentheses around an empty initializer list, @ref
-    basic_json() is called instead of this function, yielding the JSON null
-    value.
-
-    @param[in] init  initializer list with JSON values
-
-    @param[in] type_deduction internal parameter; when set to `true`, the type
-    of the JSON value is deducted from the initializer list @a init; when set
-    to `false`, the type provided via @a manual_type is forced. This mode is
-    used by the functions @ref array(initializer_list_t) and
-    @ref object(initializer_list_t).
-
-    @param[in] manual_type internal parameter; when @a type_deduction is set
-    to `false`, the created JSON value will use the provided type (only @ref
-    value_t::array and @ref value_t::object are valid); when @a type_deduction
-    is set to `true`, this parameter has no effect
-
-    @throw type_error.301 if @a type_deduction is `false`, @a manual_type is
-    `value_t::object`, but @a init contains an element which is not a pair
-    whose first element is a string. In this case, the constructor could not
-    create an object. If @a type_deduction would have be `true`, an array
-    would have been created. See @ref object(initializer_list_t)
-    for an example.
-
-    @complexity Linear in the size of the initializer list @a init.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The example below shows how JSON values are created from
-    initializer lists.,basic_json__list_init_t}
-
-    @sa @ref array(initializer_list_t) -- create a JSON array
-    value from an initializer list
-    @sa @ref object(initializer_list_t) -- create a JSON object
-    value from an initializer list
-
-    @since version 1.0.0
-    */
-    basic_json(initializer_list_t init,
-               bool type_deduction = true,
-               value_t manual_type = value_t::array)
-    {
-        // check if each element is an array with two elements whose first
-        // element is a string
-        bool is_an_object = std::all_of(init.begin(), init.end(),
-                                        [](const detail::json_ref<basic_json>& element_ref)
-        {
-            return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string();
-        });
-
-        // adjust type if type deduction is not wanted
-        if (!type_deduction)
-        {
-            // if array is wanted, do not create an object though possible
-            if (manual_type == value_t::array)
-            {
-                is_an_object = false;
-            }
-
-            // if object is wanted but impossible, throw an exception
-            if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object))
-            {
-                JSON_THROW(type_error::create(301, "cannot create object from initializer list"));
-            }
-        }
-
-        if (is_an_object)
-        {
-            // the initializer list is a list of pairs -> create object
-            m_type = value_t::object;
-            m_value = value_t::object;
-
-            std::for_each(init.begin(), init.end(), [this](const detail::json_ref<basic_json>& element_ref)
-            {
-                auto element = element_ref.moved_or_copied();
-                m_value.object->emplace(
-                    std::move(*((*element.m_value.array)[0].m_value.string)),
-                    std::move((*element.m_value.array)[1]));
-            });
-        }
-        else
-        {
-            // the initializer list describes an array -> create array
-            m_type = value_t::array;
-            m_value.array = create<array_t>(init.begin(), init.end());
-        }
-
-        assert_invariant();
-    }
-
-    /*!
-    @brief explicitly create a binary array (without subtype)
-
-    Creates a JSON binary array value from a given binary container. Binary
-    values are part of various binary formats, such as CBOR, MessagePack, and
-    BSON. This constructor is used to create a value for serialization to those
-    formats.
-
-    @note Note, this function exists because of the difficulty in correctly
-    specifying the correct template overload in the standard value ctor, as both
-    JSON arrays and JSON binary arrays are backed with some form of a
-    `std::vector`. Because JSON binary arrays are a non-standard extension it
-    was decided that it would be best to prevent automatic initialization of a
-    binary array type, for backwards compatibility and so it does not happen on
-    accident.
-
-    @param[in] init container containing bytes to use as binary type
-
-    @return JSON binary array value
-
-    @complexity Linear in the size of @a init.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @since version 3.8.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json binary(const typename binary_t::container_type& init)
-    {
-        auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = init;
-        return res;
-    }
-
-    /*!
-    @brief explicitly create a binary array (with subtype)
-
-    Creates a JSON binary array value from a given binary container. Binary
-    values are part of various binary formats, such as CBOR, MessagePack, and
-    BSON. This constructor is used to create a value for serialization to those
-    formats.
-
-    @note Note, this function exists because of the difficulty in correctly
-    specifying the correct template overload in the standard value ctor, as both
-    JSON arrays and JSON binary arrays are backed with some form of a
-    `std::vector`. Because JSON binary arrays are a non-standard extension it
-    was decided that it would be best to prevent automatic initialization of a
-    binary array type, for backwards compatibility and so it does not happen on
-    accident.
-
-    @param[in] init container containing bytes to use as binary type
-    @param[in] subtype subtype to use in MessagePack and BSON
-
-    @return JSON binary array value
-
-    @complexity Linear in the size of @a init.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @since version 3.8.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json binary(const typename binary_t::container_type& init, std::uint8_t subtype)
-    {
-        auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = binary_t(init, subtype);
-        return res;
-    }
-
-    /// @copydoc binary(const typename binary_t::container_type&)
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json binary(typename binary_t::container_type&& init)
-    {
-        auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = std::move(init);
-        return res;
-    }
-
-    /// @copydoc binary(const typename binary_t::container_type&, std::uint8_t)
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json binary(typename binary_t::container_type&& init, std::uint8_t subtype)
-    {
-        auto res = basic_json();
-        res.m_type = value_t::binary;
-        res.m_value = binary_t(std::move(init), subtype);
-        return res;
-    }
-
-    /*!
-    @brief explicitly create an array from an initializer list
-
-    Creates a JSON array value from a given initializer list. That is, given a
-    list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
-    initializer list is empty, the empty array `[]` is created.
-
-    @note This function is only needed to express two edge cases that cannot
-    be realized with the initializer list constructor (@ref
-    basic_json(initializer_list_t, bool, value_t)). These cases
-    are:
-    1. creating an array whose elements are all pairs whose first element is a
-    string -- in this case, the initializer list constructor would create an
-    object, taking the first elements as keys
-    2. creating an empty array -- passing the empty initializer list to the
-    initializer list constructor yields an empty object
-
-    @param[in] init  initializer list with JSON values to create an array from
-    (optional)
-
-    @return JSON array value
-
-    @complexity Linear in the size of @a init.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The following code shows an example for the `array`
-    function.,array}
-
-    @sa @ref basic_json(initializer_list_t, bool, value_t) --
-    create a JSON value from an initializer list
-    @sa @ref object(initializer_list_t) -- create a JSON object
-    value from an initializer list
-
-    @since version 1.0.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json array(initializer_list_t init = {})
-    {
-        return basic_json(init, false, value_t::array);
-    }
-
-    /*!
-    @brief explicitly create an object from an initializer list
-
-    Creates a JSON object value from a given initializer list. The initializer
-    lists elements must be pairs, and their first elements must be strings. If
-    the initializer list is empty, the empty object `{}` is created.
-
-    @note This function is only added for symmetry reasons. In contrast to the
-    related function @ref array(initializer_list_t), there are
-    no cases which can only be expressed by this function. That is, any
-    initializer list @a init can also be passed to the initializer list
-    constructor @ref basic_json(initializer_list_t, bool, value_t).
-
-    @param[in] init  initializer list to create an object from (optional)
-
-    @return JSON object value
-
-    @throw type_error.301 if @a init is not a list of pairs whose first
-    elements are strings. In this case, no object can be created. When such a
-    value is passed to @ref basic_json(initializer_list_t, bool, value_t),
-    an array would have been created from the passed initializer list @a init.
-    See example below.
-
-    @complexity Linear in the size of @a init.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The following code shows an example for the `object`
-    function.,object}
-
-    @sa @ref basic_json(initializer_list_t, bool, value_t) --
-    create a JSON value from an initializer list
-    @sa @ref array(initializer_list_t) -- create a JSON array
-    value from an initializer list
-
-    @since version 1.0.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json object(initializer_list_t init = {})
-    {
-        return basic_json(init, false, value_t::object);
-    }
-
-    /*!
-    @brief construct an array with count copies of given value
-
-    Constructs a JSON array value by creating @a cnt copies of a passed value.
-    In case @a cnt is `0`, an empty array is created.
-
-    @param[in] cnt  the number of JSON copies of @a val to create
-    @param[in] val  the JSON value to copy
-
-    @post `std::distance(begin(),end()) == cnt` holds.
-
-    @complexity Linear in @a cnt.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The following code shows examples for the @ref
-    basic_json(size_type\, const basic_json&)
-    constructor.,basic_json__size_type_basic_json}
-
-    @since version 1.0.0
-    */
-    basic_json(size_type cnt, const basic_json& val)
-        : m_type(value_t::array)
-    {
-        m_value.array = create<array_t>(cnt, val);
-        assert_invariant();
-    }
-
-    /*!
-    @brief construct a JSON container given an iterator range
-
-    Constructs the JSON value with the contents of the range `[first, last)`.
-    The semantics depends on the different types a JSON value can have:
-    - In case of a null type, invalid_iterator.206 is thrown.
-    - In case of other primitive types (number, boolean, or string), @a first
-      must be `begin()` and @a last must be `end()`. In this case, the value is
-      copied. Otherwise, invalid_iterator.204 is thrown.
-    - In case of structured types (array, object), the constructor behaves as
-      similar versions for `std::vector` or `std::map`; that is, a JSON array
-      or object is constructed from the values in the range.
-
-    @tparam InputIT an input iterator type (@ref iterator or @ref
-    const_iterator)
-
-    @param[in] first begin of the range to copy from (included)
-    @param[in] last end of the range to copy from (excluded)
-
-    @pre Iterators @a first and @a last must be initialized. **This
-         precondition is enforced with an assertion (see warning).** If
-         assertions are switched off, a violation of this precondition yields
-         undefined behavior.
-
-    @pre Range `[first, last)` is valid. Usually, this precondition cannot be
-         checked efficiently. Only certain edge cases are detected; see the
-         description of the exceptions below. A violation of this precondition
-         yields undefined behavior.
-
-    @warning A precondition is enforced with a runtime assertion that will
-             result in calling `std::abort` if this precondition is not met.
-             Assertions can be disabled by defining `NDEBUG` at compile time.
-             See https://en.cppreference.com/w/cpp/error/assert for more
-             information.
-
-    @throw invalid_iterator.201 if iterators @a first and @a last are not
-    compatible (i.e., do not belong to the same JSON value). In this case,
-    the range `[first, last)` is undefined.
-    @throw invalid_iterator.204 if iterators @a first and @a last belong to a
-    primitive type (number, boolean, or string), but @a first does not point
-    to the first element any more. In this case, the range `[first, last)` is
-    undefined. See example code below.
-    @throw invalid_iterator.206 if iterators @a first and @a last belong to a
-    null value. In this case, the range `[first, last)` is undefined.
-
-    @complexity Linear in distance between @a first and @a last.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @liveexample{The example below shows several ways to create JSON values by
-    specifying a subrange with iterators.,basic_json__InputIt_InputIt}
-
-    @since version 1.0.0
-    */
-    template < class InputIT, typename std::enable_if <
-                   std::is_same<InputIT, typename basic_json_t::iterator>::value ||
-                   std::is_same<InputIT, typename basic_json_t::const_iterator>::value, int >::type = 0 >
-    basic_json(InputIT first, InputIT last)
-    {
-        JSON_ASSERT(first.m_object != nullptr);
-        JSON_ASSERT(last.m_object != nullptr);
-
-        // make sure iterator fits the current value
-        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(201, "iterators are not compatible"));
-        }
-
-        // copy type from first iterator
-        m_type = first.m_object->m_type;
-
-        // check if iterator range is complete for primitive values
-        switch (m_type)
-        {
-            case value_t::boolean:
-            case value_t::number_float:
-            case value_t::number_integer:
-            case value_t::number_unsigned:
-            case value_t::string:
-            {
-                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
-                                         || !last.m_it.primitive_iterator.is_end()))
-                {
-                    JSON_THROW(invalid_iterator::create(204, "iterators out of range"));
-                }
-                break;
-            }
-
-            default:
-                break;
-        }
-
-        switch (m_type)
-        {
-            case value_t::number_integer:
-            {
-                m_value.number_integer = first.m_object->m_value.number_integer;
-                break;
-            }
-
-            case value_t::number_unsigned:
-            {
-                m_value.number_unsigned = first.m_object->m_value.number_unsigned;
-                break;
-            }
-
-            case value_t::number_float:
-            {
-                m_value.number_float = first.m_object->m_value.number_float;
-                break;
-            }
-
-            case value_t::boolean:
-            {
-                m_value.boolean = first.m_object->m_value.boolean;
-                break;
-            }
-
-            case value_t::string:
-            {
-                m_value = *first.m_object->m_value.string;
-                break;
-            }
-
-            case value_t::object:
-            {
-                m_value.object = create<object_t>(first.m_it.object_iterator,
-                                                  last.m_it.object_iterator);
-                break;
-            }
-
-            case value_t::array:
-            {
-                m_value.array = create<array_t>(first.m_it.array_iterator,
-                                                last.m_it.array_iterator);
-                break;
-            }
-
-            case value_t::binary:
-            {
-                m_value = *first.m_object->m_value.binary;
-                break;
-            }
-
-            default:
-                JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " +
-                                                    std::string(first.m_object->type_name())));
-        }
-
-        assert_invariant();
-    }
-
-
-    ///////////////////////////////////////
-    // other constructors and destructor //
-    ///////////////////////////////////////
-
-    template<typename JsonRef,
-             detail::enable_if_t<detail::conjunction<detail::is_json_ref<JsonRef>,
-                                 std::is_same<typename JsonRef::value_type, basic_json>>::value, int> = 0 >
-    basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {}
-
-    /*!
-    @brief copy constructor
-
-    Creates a copy of a given JSON value.
-
-    @param[in] other  the JSON value to copy
-
-    @post `*this == other`
-
-    @complexity Linear in the size of @a other.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes to any JSON value.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is linear.
-    - As postcondition, it holds: `other == basic_json(other)`.
-
-    @liveexample{The following code shows an example for the copy
-    constructor.,basic_json__basic_json}
-
-    @since version 1.0.0
-    */
-    basic_json(const basic_json& other)
-        : m_type(other.m_type)
-    {
-        // check of passed value is valid
-        other.assert_invariant();
-
-        switch (m_type)
-        {
-            case value_t::object:
-            {
-                m_value = *other.m_value.object;
-                break;
-            }
-
-            case value_t::array:
-            {
-                m_value = *other.m_value.array;
-                break;
-            }
-
-            case value_t::string:
-            {
-                m_value = *other.m_value.string;
-                break;
-            }
-
-            case value_t::boolean:
-            {
-                m_value = other.m_value.boolean;
-                break;
-            }
-
-            case value_t::number_integer:
-            {
-                m_value = other.m_value.number_integer;
-                break;
-            }
-
-            case value_t::number_unsigned:
-            {
-                m_value = other.m_value.number_unsigned;
-                break;
-            }
-
-            case value_t::number_float:
-            {
-                m_value = other.m_value.number_float;
-                break;
-            }
-
-            case value_t::binary:
-            {
-                m_value = *other.m_value.binary;
-                break;
-            }
-
-            default:
-                break;
-        }
-
-        assert_invariant();
-    }
-
-    /*!
-    @brief move constructor
-
-    Move constructor. Constructs a JSON value with the contents of the given
-    value @a other using move semantics. It "steals" the resources from @a
-    other and leaves it as JSON null value.
-
-    @param[in,out] other  value to move to this object
-
-    @post `*this` has the same value as @a other before the call.
-    @post @a other is a JSON null value.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this constructor never throws
-    exceptions.
-
-    @requirement This function helps `basic_json` satisfying the
-    [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible)
-    requirements.
-
-    @liveexample{The code below shows the move constructor explicitly called
-    via std::move.,basic_json__moveconstructor}
-
-    @since version 1.0.0
-    */
-    basic_json(basic_json&& other) noexcept
-        : m_type(std::move(other.m_type)),
-          m_value(std::move(other.m_value))
-    {
-        // check that passed value is valid
-        other.assert_invariant();
-
-        // invalidate payload
-        other.m_type = value_t::null;
-        other.m_value = {};
-
-        assert_invariant();
-    }
-
-    /*!
-    @brief copy assignment
-
-    Copy assignment operator. Copies a JSON value via the "copy and swap"
-    strategy: It is expressed in terms of the copy constructor, destructor,
-    and the `swap()` member function.
-
-    @param[in] other  value to copy from
-
-    @complexity Linear.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is linear.
-
-    @liveexample{The code below shows and example for the copy assignment. It
-    creates a copy of value `a` which is then swapped with `b`. Finally\, the
-    copy of `a` (which is the null value after the swap) is
-    destroyed.,basic_json__copyassignment}
-
-    @since version 1.0.0
-    */
-    basic_json& operator=(basic_json other) noexcept (
-        std::is_nothrow_move_constructible<value_t>::value&&
-        std::is_nothrow_move_assignable<value_t>::value&&
-        std::is_nothrow_move_constructible<json_value>::value&&
-        std::is_nothrow_move_assignable<json_value>::value
-    )
-    {
-        // check that passed value is valid
-        other.assert_invariant();
-
-        using std::swap;
-        swap(m_type, other.m_type);
-        swap(m_value, other.m_value);
-
-        assert_invariant();
-        return *this;
-    }
-
-    /*!
-    @brief destructor
-
-    Destroys the JSON value and frees all allocated memory.
-
-    @complexity Linear.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is linear.
-    - All stored elements are destroyed and all memory is freed.
-
-    @since version 1.0.0
-    */
-    ~basic_json() noexcept
-    {
-        assert_invariant();
-        m_value.destroy(m_type);
-    }
-
-    /// @}
-
-  public:
-    ///////////////////////
-    // object inspection //
-    ///////////////////////
-
-    /// @name object inspection
-    /// Functions to inspect the type of a JSON value.
-    /// @{
-
-    /*!
-    @brief serialization
-
-    Serialization function for JSON values. The function tries to mimic
-    Python's `json.dumps()` function, and currently supports its @a indent
-    and @a ensure_ascii parameters.
-
-    @param[in] indent If indent is nonnegative, then array elements and object
-    members will be pretty-printed with that indent level. An indent level of
-    `0` will only insert newlines. `-1` (the default) selects the most compact
-    representation.
-    @param[in] indent_char The character to use for indentation if @a indent is
-    greater than `0`. The default is ` ` (space).
-    @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
-    in the output are escaped with `\uXXXX` sequences, and the result consists
-    of ASCII characters only.
-    @param[in] error_handler  how to react on decoding errors; there are three
-    possible values: `strict` (throws and exception in case a decoding error
-    occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
-    and `ignore` (ignore invalid UTF-8 sequences during serialization; all
-    bytes are copied to the output unchanged).
-
-    @return string containing the serialization of the JSON value
-
-    @throw type_error.316 if a string stored inside the JSON value is not
-                          UTF-8 encoded and @a error_handler is set to strict
-
-    @note Binary values are serialized as object containing two keys:
-      - "bytes": an array of bytes as integers
-      - "subtype": the subtype as integer or "null" if the binary has no subtype
-
-    @complexity Linear.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @liveexample{The following example shows the effect of different @a indent\,
-    @a indent_char\, and @a ensure_ascii parameters to the result of the
-    serialization.,dump}
-
-    @see https://docs.python.org/2/library/json.html#json.dump
-
-    @since version 1.0.0; indentation character @a indent_char, option
-           @a ensure_ascii and exceptions added in version 3.0.0; error
-           handlers added in version 3.4.0; serialization of binary values added
-           in version 3.8.0.
-    */
-    string_t dump(const int indent = -1,
-                  const char indent_char = ' ',
-                  const bool ensure_ascii = false,
-                  const error_handler_t error_handler = error_handler_t::strict) const
-    {
-        string_t result;
-        serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);
-
-        if (indent >= 0)
-        {
-            s.dump(*this, true, ensure_ascii, static_cast<unsigned int>(indent));
-        }
-        else
-        {
-            s.dump(*this, false, ensure_ascii, 0);
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief return the type of the JSON value (explicit)
-
-    Return the type of the JSON value as a value from the @ref value_t
-    enumeration.
-
-    @return the type of the JSON value
-            Value type                | return value
-            ------------------------- | -------------------------
-            null                      | value_t::null
-            boolean                   | value_t::boolean
-            string                    | value_t::string
-            number (integer)          | value_t::number_integer
-            number (unsigned integer) | value_t::number_unsigned
-            number (floating-point)   | value_t::number_float
-            object                    | value_t::object
-            array                     | value_t::array
-            binary                    | value_t::binary
-            discarded                 | value_t::discarded
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `type()` for all JSON
-    types.,type}
-
-    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
-    @sa @ref type_name() -- return the type as string
-
-    @since version 1.0.0
-    */
-    constexpr value_t type() const noexcept
-    {
-        return m_type;
-    }
-
-    /*!
-    @brief return whether type is primitive
-
-    This function returns true if and only if the JSON type is primitive
-    (string, number, boolean, or null).
-
-    @return `true` if type is primitive (string, number, boolean, or null),
-    `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_primitive()` for all JSON
-    types.,is_primitive}
-
-    @sa @ref is_structured() -- returns whether JSON value is structured
-    @sa @ref is_null() -- returns whether JSON value is `null`
-    @sa @ref is_string() -- returns whether JSON value is a string
-    @sa @ref is_boolean() -- returns whether JSON value is a boolean
-    @sa @ref is_number() -- returns whether JSON value is a number
-    @sa @ref is_binary() -- returns whether JSON value is a binary array
-
-    @since version 1.0.0
-    */
-    constexpr bool is_primitive() const noexcept
-    {
-        return is_null() || is_string() || is_boolean() || is_number() || is_binary();
-    }
-
-    /*!
-    @brief return whether type is structured
-
-    This function returns true if and only if the JSON type is structured
-    (array or object).
-
-    @return `true` if type is structured (array or object), `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_structured()` for all JSON
-    types.,is_structured}
-
-    @sa @ref is_primitive() -- returns whether value is primitive
-    @sa @ref is_array() -- returns whether value is an array
-    @sa @ref is_object() -- returns whether value is an object
-
-    @since version 1.0.0
-    */
-    constexpr bool is_structured() const noexcept
-    {
-        return is_array() || is_object();
-    }
-
-    /*!
-    @brief return whether value is null
-
-    This function returns true if and only if the JSON value is null.
-
-    @return `true` if type is null, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_null()` for all JSON
-    types.,is_null}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_null() const noexcept
-    {
-        return m_type == value_t::null;
-    }
-
-    /*!
-    @brief return whether value is a boolean
-
-    This function returns true if and only if the JSON value is a boolean.
-
-    @return `true` if type is boolean, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_boolean()` for all JSON
-    types.,is_boolean}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_boolean() const noexcept
-    {
-        return m_type == value_t::boolean;
-    }
-
-    /*!
-    @brief return whether value is a number
-
-    This function returns true if and only if the JSON value is a number. This
-    includes both integer (signed and unsigned) and floating-point values.
-
-    @return `true` if type is number (regardless whether integer, unsigned
-    integer or floating-type), `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_number()` for all JSON
-    types.,is_number}
-
-    @sa @ref is_number_integer() -- check if value is an integer or unsigned
-    integer number
-    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
-    number
-    @sa @ref is_number_float() -- check if value is a floating-point number
-
-    @since version 1.0.0
-    */
-    constexpr bool is_number() const noexcept
-    {
-        return is_number_integer() || is_number_float();
-    }
-
-    /*!
-    @brief return whether value is an integer number
-
-    This function returns true if and only if the JSON value is a signed or
-    unsigned integer number. This excludes floating-point values.
-
-    @return `true` if type is an integer or unsigned integer number, `false`
-    otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_number_integer()` for all
-    JSON types.,is_number_integer}
-
-    @sa @ref is_number() -- check if value is a number
-    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
-    number
-    @sa @ref is_number_float() -- check if value is a floating-point number
-
-    @since version 1.0.0
-    */
-    constexpr bool is_number_integer() const noexcept
-    {
-        return m_type == value_t::number_integer || m_type == value_t::number_unsigned;
-    }
-
-    /*!
-    @brief return whether value is an unsigned integer number
-
-    This function returns true if and only if the JSON value is an unsigned
-    integer number. This excludes floating-point and signed integer values.
-
-    @return `true` if type is an unsigned integer number, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_number_unsigned()` for all
-    JSON types.,is_number_unsigned}
-
-    @sa @ref is_number() -- check if value is a number
-    @sa @ref is_number_integer() -- check if value is an integer or unsigned
-    integer number
-    @sa @ref is_number_float() -- check if value is a floating-point number
-
-    @since version 2.0.0
-    */
-    constexpr bool is_number_unsigned() const noexcept
-    {
-        return m_type == value_t::number_unsigned;
-    }
-
-    /*!
-    @brief return whether value is a floating-point number
-
-    This function returns true if and only if the JSON value is a
-    floating-point number. This excludes signed and unsigned integer values.
-
-    @return `true` if type is a floating-point number, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_number_float()` for all
-    JSON types.,is_number_float}
-
-    @sa @ref is_number() -- check if value is number
-    @sa @ref is_number_integer() -- check if value is an integer number
-    @sa @ref is_number_unsigned() -- check if value is an unsigned integer
-    number
-
-    @since version 1.0.0
-    */
-    constexpr bool is_number_float() const noexcept
-    {
-        return m_type == value_t::number_float;
-    }
-
-    /*!
-    @brief return whether value is an object
-
-    This function returns true if and only if the JSON value is an object.
-
-    @return `true` if type is object, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_object()` for all JSON
-    types.,is_object}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_object() const noexcept
-    {
-        return m_type == value_t::object;
-    }
-
-    /*!
-    @brief return whether value is an array
-
-    This function returns true if and only if the JSON value is an array.
-
-    @return `true` if type is array, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_array()` for all JSON
-    types.,is_array}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_array() const noexcept
-    {
-        return m_type == value_t::array;
-    }
-
-    /*!
-    @brief return whether value is a string
-
-    This function returns true if and only if the JSON value is a string.
-
-    @return `true` if type is string, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_string()` for all JSON
-    types.,is_string}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_string() const noexcept
-    {
-        return m_type == value_t::string;
-    }
-
-    /*!
-    @brief return whether value is a binary array
-
-    This function returns true if and only if the JSON value is a binary array.
-
-    @return `true` if type is binary array, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_binary()` for all JSON
-    types.,is_binary}
-
-    @since version 3.8.0
-    */
-    constexpr bool is_binary() const noexcept
-    {
-        return m_type == value_t::binary;
-    }
-
-    /*!
-    @brief return whether value is discarded
-
-    This function returns true if and only if the JSON value was discarded
-    during parsing with a callback function (see @ref parser_callback_t).
-
-    @note This function will always be `false` for JSON values after parsing.
-    That is, discarded values can only occur during parsing, but will be
-    removed when inside a structured value or replaced by null in other cases.
-
-    @return `true` if type is discarded, `false` otherwise.
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies `is_discarded()` for all JSON
-    types.,is_discarded}
-
-    @since version 1.0.0
-    */
-    constexpr bool is_discarded() const noexcept
-    {
-        return m_type == value_t::discarded;
-    }
-
-    /*!
-    @brief return the type of the JSON value (implicit)
-
-    Implicitly return the type of the JSON value as a value from the @ref
-    value_t enumeration.
-
-    @return the type of the JSON value
-
-    @complexity Constant.
-
-    @exceptionsafety No-throw guarantee: this member function never throws
-    exceptions.
-
-    @liveexample{The following code exemplifies the @ref value_t operator for
-    all JSON types.,operator__value_t}
-
-    @sa @ref type() -- return the type of the JSON value (explicit)
-    @sa @ref type_name() -- return the type as string
-
-    @since version 1.0.0
-    */
-    constexpr operator value_t() const noexcept
-    {
-        return m_type;
-    }
-
-    /// @}
-
-  private:
-    //////////////////
-    // value access //
-    //////////////////
-
-    /// get a boolean (explicit)
-    boolean_t get_impl(boolean_t* /*unused*/) const
-    {
-        if (JSON_HEDLEY_LIKELY(is_boolean()))
-        {
-            return m_value.boolean;
-        }
-
-        JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name())));
-    }
-
-    /// get a pointer to the value (object)
-    object_t* get_impl_ptr(object_t* /*unused*/) noexcept
-    {
-        return is_object() ? m_value.object : nullptr;
-    }
-
-    /// get a pointer to the value (object)
-    constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
-    {
-        return is_object() ? m_value.object : nullptr;
-    }
-
-    /// get a pointer to the value (array)
-    array_t* get_impl_ptr(array_t* /*unused*/) noexcept
-    {
-        return is_array() ? m_value.array : nullptr;
-    }
-
-    /// get a pointer to the value (array)
-    constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
-    {
-        return is_array() ? m_value.array : nullptr;
-    }
-
-    /// get a pointer to the value (string)
-    string_t* get_impl_ptr(string_t* /*unused*/) noexcept
-    {
-        return is_string() ? m_value.string : nullptr;
-    }
-
-    /// get a pointer to the value (string)
-    constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
-    {
-        return is_string() ? m_value.string : nullptr;
-    }
-
-    /// get a pointer to the value (boolean)
-    boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
-    {
-        return is_boolean() ? &m_value.boolean : nullptr;
-    }
-
-    /// get a pointer to the value (boolean)
-    constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
-    {
-        return is_boolean() ? &m_value.boolean : nullptr;
-    }
-
-    /// get a pointer to the value (integer number)
-    number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
-    {
-        return is_number_integer() ? &m_value.number_integer : nullptr;
-    }
-
-    /// get a pointer to the value (integer number)
-    constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
-    {
-        return is_number_integer() ? &m_value.number_integer : nullptr;
-    }
-
-    /// get a pointer to the value (unsigned number)
-    number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
-    {
-        return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
-    }
-
-    /// get a pointer to the value (unsigned number)
-    constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
-    {
-        return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
-    }
-
-    /// get a pointer to the value (floating-point number)
-    number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
-    {
-        return is_number_float() ? &m_value.number_float : nullptr;
-    }
-
-    /// get a pointer to the value (floating-point number)
-    constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
-    {
-        return is_number_float() ? &m_value.number_float : nullptr;
-    }
-
-    /// get a pointer to the value (binary)
-    binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
-    {
-        return is_binary() ? m_value.binary : nullptr;
-    }
-
-    /// get a pointer to the value (binary)
-    constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
-    {
-        return is_binary() ? m_value.binary : nullptr;
-    }
-
-    /*!
-    @brief helper function to implement get_ref()
-
-    This function helps to implement get_ref() without code duplication for
-    const and non-const overloads
-
-    @tparam ThisType will be deduced as `basic_json` or `const basic_json`
-
-    @throw type_error.303 if ReferenceType does not match underlying value
-    type of the current JSON
-    */
-    template<typename ReferenceType, typename ThisType>
-    static ReferenceType get_ref_impl(ThisType& obj)
-    {
-        // delegate the call to get_ptr<>()
-        auto ptr = obj.template get_ptr<typename std::add_pointer<ReferenceType>::type>();
-
-        if (JSON_HEDLEY_LIKELY(ptr != nullptr))
-        {
-            return *ptr;
-        }
-
-        JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name())));
-    }
-
-  public:
-    /// @name value access
-    /// Direct access to the stored value of a JSON value.
-    /// @{
-
-    /*!
-    @brief get special-case overload
-
-    This overloads avoids a lot of template boilerplate, it can be seen as the
-    identity method
-
-    @tparam BasicJsonType == @ref basic_json
-
-    @return a copy of *this
-
-    @complexity Constant.
-
-    @since version 2.1.0
-    */
-    template<typename BasicJsonType, detail::enable_if_t<
-                 std::is_same<typename std::remove_const<BasicJsonType>::type, basic_json_t>::value,
-                 int> = 0>
-    basic_json get() const
-    {
-        return *this;
-    }
-
-    /*!
-    @brief get special-case overload
-
-    This overloads converts the current @ref basic_json in a different
-    @ref basic_json type
-
-    @tparam BasicJsonType == @ref basic_json
-
-    @return a copy of *this, converted into @tparam BasicJsonType
-
-    @complexity Depending on the implementation of the called `from_json()`
-                method.
-
-    @since version 3.2.0
-    */
-    template < typename BasicJsonType, detail::enable_if_t <
-                   !std::is_same<BasicJsonType, basic_json>::value&&
-                   detail::is_basic_json<BasicJsonType>::value, int > = 0 >
-    BasicJsonType get() const
-    {
-        return *this;
-    }
-
-    /*!
-    @brief get a value (explicit)
-
-    Explicit type conversion between the JSON value and a compatible value
-    which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
-    and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
-    The value is converted by calling the @ref json_serializer<ValueType>
-    `from_json()` method.
-
-    The function is equivalent to executing
-    @code {.cpp}
-    ValueType ret;
-    JSONSerializer<ValueType>::from_json(*this, ret);
-    return ret;
-    @endcode
-
-    This overloads is chosen if:
-    - @a ValueType is not @ref basic_json,
-    - @ref json_serializer<ValueType> has a `from_json()` method of the form
-      `void from_json(const basic_json&, ValueType&)`, and
-    - @ref json_serializer<ValueType> does not have a `from_json()` method of
-      the form `ValueType from_json(const basic_json&)`
-
-    @tparam ValueTypeCV the provided value type
-    @tparam ValueType the returned value type
-
-    @return copy of the JSON value, converted to @a ValueType
-
-    @throw what @ref json_serializer<ValueType> `from_json()` method throws
-
-    @liveexample{The example below shows several conversions from JSON values
-    to other types. There a few things to note: (1) Floating-point numbers can
-    be converted to integers\, (2) A JSON array can be converted to a standard
-    `std::vector<short>`\, (3) A JSON object can be converted to C++
-    associative containers such as `std::unordered_map<std::string\,
-    json>`.,get__ValueType_const}
-
-    @since version 2.1.0
-    */
-    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
-               detail::enable_if_t <
-                   !detail::is_basic_json<ValueType>::value &&
-                   detail::has_from_json<basic_json_t, ValueType>::value &&
-                   !detail::has_non_default_from_json<basic_json_t, ValueType>::value,
-                   int > = 0 >
-    ValueType get() const noexcept(noexcept(
-                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
-    {
-        // we cannot static_assert on ValueTypeCV being non-const, because
-        // there is support for get<const basic_json_t>(), which is why we
-        // still need the uncvref
-        static_assert(!std::is_reference<ValueTypeCV>::value,
-                      "get() cannot be used with reference types, you might want to use get_ref()");
-        static_assert(std::is_default_constructible<ValueType>::value,
-                      "types must be DefaultConstructible when used with get()");
-
-        ValueType ret;
-        JSONSerializer<ValueType>::from_json(*this, ret);
-        return ret;
-    }
-
-    /*!
-    @brief get a value (explicit); special case
-
-    Explicit type conversion between the JSON value and a compatible value
-    which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
-    and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
-    The value is converted by calling the @ref json_serializer<ValueType>
-    `from_json()` method.
-
-    The function is equivalent to executing
-    @code {.cpp}
-    return JSONSerializer<ValueTypeCV>::from_json(*this);
-    @endcode
-
-    This overloads is chosen if:
-    - @a ValueType is not @ref basic_json and
-    - @ref json_serializer<ValueType> has a `from_json()` method of the form
-      `ValueType from_json(const basic_json&)`
-
-    @note If @ref json_serializer<ValueType> has both overloads of
-    `from_json()`, this one is chosen.
-
-    @tparam ValueTypeCV the provided value type
-    @tparam ValueType the returned value type
-
-    @return copy of the JSON value, converted to @a ValueType
-
-    @throw what @ref json_serializer<ValueType> `from_json()` method throws
-
-    @since version 2.1.0
-    */
-    template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>,
-               detail::enable_if_t < !std::is_same<basic_json_t, ValueType>::value &&
-                                     detail::has_non_default_from_json<basic_json_t, ValueType>::value,
-                                     int > = 0 >
-    ValueType get() const noexcept(noexcept(
-                                       JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>())))
-    {
-        static_assert(!std::is_reference<ValueTypeCV>::value,
-                      "get() cannot be used with reference types, you might want to use get_ref()");
-        return JSONSerializer<ValueType>::from_json(*this);
-    }
-
-    /*!
-    @brief get a value (explicit)
-
-    Explicit type conversion between the JSON value and a compatible value.
-    The value is filled into the input parameter by calling the @ref json_serializer<ValueType>
-    `from_json()` method.
-
-    The function is equivalent to executing
-    @code {.cpp}
-    ValueType v;
-    JSONSerializer<ValueType>::from_json(*this, v);
-    @endcode
-
-    This overloads is chosen if:
-    - @a ValueType is not @ref basic_json,
-    - @ref json_serializer<ValueType> has a `from_json()` method of the form
-      `void from_json(const basic_json&, ValueType&)`, and
-
-    @tparam ValueType the input parameter type.
-
-    @return the input parameter, allowing chaining calls.
-
-    @throw what @ref json_serializer<ValueType> `from_json()` method throws
-
-    @liveexample{The example below shows several conversions from JSON values
-    to other types. There a few things to note: (1) Floating-point numbers can
-    be converted to integers\, (2) A JSON array can be converted to a standard
-    `std::vector<short>`\, (3) A JSON object can be converted to C++
-    associative containers such as `std::unordered_map<std::string\,
-    json>`.,get_to}
-
-    @since version 3.3.0
-    */
-    template < typename ValueType,
-               detail::enable_if_t <
-                   !detail::is_basic_json<ValueType>::value&&
-                   detail::has_from_json<basic_json_t, ValueType>::value,
-                   int > = 0 >
-    ValueType & get_to(ValueType& v) const noexcept(noexcept(
-                JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
-    {
-        JSONSerializer<ValueType>::from_json(*this, v);
-        return v;
-    }
-
-    // specialization to allow to call get_to with a basic_json value
-    // see https://github.com/nlohmann/json/issues/2175
-    template<typename ValueType,
-             detail::enable_if_t <
-                 detail::is_basic_json<ValueType>::value,
-                 int> = 0>
-    ValueType & get_to(ValueType& v) const
-    {
-        v = *this;
-        return v;
-    }
-
-    template <
-        typename T, std::size_t N,
-        typename Array = T (&)[N],
-        detail::enable_if_t <
-            detail::has_from_json<basic_json_t, Array>::value, int > = 0 >
-    Array get_to(T (&v)[N]) const
-    noexcept(noexcept(JSONSerializer<Array>::from_json(
-                          std::declval<const basic_json_t&>(), v)))
-    {
-        JSONSerializer<Array>::from_json(*this, v);
-        return v;
-    }
-
-
-    /*!
-    @brief get a pointer value (implicit)
-
-    Implicit pointer access to the internally stored JSON value. No copies are
-    made.
-
-    @warning Writing data to the pointee of the result yields an undefined
-    state.
-
-    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
-    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
-    @ref number_unsigned_t, or @ref number_float_t. Enforced by a static
-    assertion.
-
-    @return pointer to the internally stored JSON value if the requested
-    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how pointers to internal values of a
-    JSON value can be requested. Note that no type conversions are made and a
-    `nullptr` is returned if the value and the requested pointer type does not
-    match.,get_ptr}
-
-    @since version 1.0.0
-    */
-    template<typename PointerType, typename std::enable_if<
-                 std::is_pointer<PointerType>::value, int>::type = 0>
-    auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
-    {
-        // delegate the call to get_impl_ptr<>()
-        return get_impl_ptr(static_cast<PointerType>(nullptr));
-    }
-
-    /*!
-    @brief get a pointer value (implicit)
-    @copydoc get_ptr()
-    */
-    template < typename PointerType, typename std::enable_if <
-                   std::is_pointer<PointerType>::value&&
-                   std::is_const<typename std::remove_pointer<PointerType>::type>::value, int >::type = 0 >
-    constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
-    {
-        // delegate the call to get_impl_ptr<>() const
-        return get_impl_ptr(static_cast<PointerType>(nullptr));
-    }
-
-    /*!
-    @brief get a pointer value (explicit)
-
-    Explicit pointer access to the internally stored JSON value. No copies are
-    made.
-
-    @warning The pointer becomes invalid if the underlying JSON object
-    changes.
-
-    @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
-    object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
-    @ref number_unsigned_t, or @ref number_float_t.
-
-    @return pointer to the internally stored JSON value if the requested
-    pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how pointers to internal values of a
-    JSON value can be requested. Note that no type conversions are made and a
-    `nullptr` is returned if the value and the requested pointer type does not
-    match.,get__PointerType}
-
-    @sa @ref get_ptr() for explicit pointer-member access
-
-    @since version 1.0.0
-    */
-    template<typename PointerType, typename std::enable_if<
-                 std::is_pointer<PointerType>::value, int>::type = 0>
-    auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
-    {
-        // delegate the call to get_ptr
-        return get_ptr<PointerType>();
-    }
-
-    /*!
-    @brief get a pointer value (explicit)
-    @copydoc get()
-    */
-    template<typename PointerType, typename std::enable_if<
-                 std::is_pointer<PointerType>::value, int>::type = 0>
-    constexpr auto get() const noexcept -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
-    {
-        // delegate the call to get_ptr
-        return get_ptr<PointerType>();
-    }
-
-    /*!
-    @brief get a reference value (implicit)
-
-    Implicit reference access to the internally stored JSON value. No copies
-    are made.
-
-    @warning Writing data to the referee of the result yields an undefined
-    state.
-
-    @tparam ReferenceType reference type; must be a reference to @ref array_t,
-    @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or
-    @ref number_float_t. Enforced by static assertion.
-
-    @return reference to the internally stored JSON value if the requested
-    reference type @a ReferenceType fits to the JSON value; throws
-    type_error.303 otherwise
-
-    @throw type_error.303 in case passed type @a ReferenceType is incompatible
-    with the stored JSON value; see example below
-
-    @complexity Constant.
-
-    @liveexample{The example shows several calls to `get_ref()`.,get_ref}
-
-    @since version 1.1.0
-    */
-    template<typename ReferenceType, typename std::enable_if<
-                 std::is_reference<ReferenceType>::value, int>::type = 0>
-    ReferenceType get_ref()
-    {
-        // delegate call to get_ref_impl
-        return get_ref_impl<ReferenceType>(*this);
-    }
-
-    /*!
-    @brief get a reference value (implicit)
-    @copydoc get_ref()
-    */
-    template < typename ReferenceType, typename std::enable_if <
-                   std::is_reference<ReferenceType>::value&&
-                   std::is_const<typename std::remove_reference<ReferenceType>::type>::value, int >::type = 0 >
-    ReferenceType get_ref() const
-    {
-        // delegate call to get_ref_impl
-        return get_ref_impl<ReferenceType>(*this);
-    }
-
-    /*!
-    @brief get a value (implicit)
-
-    Implicit type conversion between the JSON value and a compatible value.
-    The call is realized by calling @ref get() const.
-
-    @tparam ValueType non-pointer type compatible to the JSON value, for
-    instance `int` for JSON integer numbers, `bool` for JSON booleans, or
-    `std::vector` types for JSON arrays. The character type of @ref string_t
-    as well as an initializer list of this type is excluded to avoid
-    ambiguities as these types implicitly convert to `std::string`.
-
-    @return copy of the JSON value, converted to type @a ValueType
-
-    @throw type_error.302 in case passed type @a ValueType is incompatible
-    to the JSON value type (e.g., the JSON value is of type boolean, but a
-    string is requested); see example below
-
-    @complexity Linear in the size of the JSON value.
-
-    @liveexample{The example below shows several conversions from JSON values
-    to other types. There a few things to note: (1) Floating-point numbers can
-    be converted to integers\, (2) A JSON array can be converted to a standard
-    `std::vector<short>`\, (3) A JSON object can be converted to C++
-    associative containers such as `std::unordered_map<std::string\,
-    json>`.,operator__ValueType}
-
-    @since version 1.0.0
-    */
-    template < typename ValueType, typename std::enable_if <
-                   !std::is_pointer<ValueType>::value&&
-                   !std::is_same<ValueType, detail::json_ref<basic_json>>::value&&
-                   !std::is_same<ValueType, typename string_t::value_type>::value&&
-                   !detail::is_basic_json<ValueType>::value
-                   && !std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>::value
-#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
-                   && !std::is_same<ValueType, typename std::string_view>::value
-#endif
-                   && detail::is_detected<detail::get_template_function, const basic_json_t&, ValueType>::value
-                   , int >::type = 0 >
-    JSON_EXPLICIT operator ValueType() const
-    {
-        // delegate the call to get<>() const
-        return get<ValueType>();
-    }
-
-    /*!
-    @return reference to the binary value
-
-    @throw type_error.302 if the value is not binary
-
-    @sa @ref is_binary() to check if the value is binary
-
-    @since version 3.8.0
-    */
-    binary_t& get_binary()
-    {
-        if (!is_binary())
-        {
-            JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name())));
-        }
-
-        return *get_ptr<binary_t*>();
-    }
-
-    /// @copydoc get_binary()
-    const binary_t& get_binary() const
-    {
-        if (!is_binary())
-        {
-            JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name())));
-        }
-
-        return *get_ptr<const binary_t*>();
-    }
-
-    /// @}
-
-
-    ////////////////////
-    // element access //
-    ////////////////////
-
-    /// @name element access
-    /// Access to the JSON value.
-    /// @{
-
-    /*!
-    @brief access specified array element with bounds checking
-
-    Returns a reference to the element at specified location @a idx, with
-    bounds checking.
-
-    @param[in] idx  index of the element to access
-
-    @return reference to the element at index @a idx
-
-    @throw type_error.304 if the JSON value is not an array; in this case,
-    calling `at` with an index makes no sense. See example below.
-    @throw out_of_range.401 if the index @a idx is out of range of the array;
-    that is, `idx >= size()`. See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @since version 1.0.0
-
-    @liveexample{The example below shows how array elements can be read and
-    written using `at()`. It also demonstrates the different exceptions that
-    can be thrown.,at__size_type}
-    */
-    reference at(size_type idx)
-    {
-        // at only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            JSON_TRY
-            {
-                return m_value.array->at(idx);
-            }
-            JSON_CATCH (std::out_of_range&)
-            {
-                // create better exception explanation
-                JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
-            }
-        }
-        else
-        {
-            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief access specified array element with bounds checking
-
-    Returns a const reference to the element at specified location @a idx,
-    with bounds checking.
-
-    @param[in] idx  index of the element to access
-
-    @return const reference to the element at index @a idx
-
-    @throw type_error.304 if the JSON value is not an array; in this case,
-    calling `at` with an index makes no sense. See example below.
-    @throw out_of_range.401 if the index @a idx is out of range of the array;
-    that is, `idx >= size()`. See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @since version 1.0.0
-
-    @liveexample{The example below shows how array elements can be read using
-    `at()`. It also demonstrates the different exceptions that can be thrown.,
-    at__size_type_const}
-    */
-    const_reference at(size_type idx) const
-    {
-        // at only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            JSON_TRY
-            {
-                return m_value.array->at(idx);
-            }
-            JSON_CATCH (std::out_of_range&)
-            {
-                // create better exception explanation
-                JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
-            }
-        }
-        else
-        {
-            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief access specified object element with bounds checking
-
-    Returns a reference to the element at with specified key @a key, with
-    bounds checking.
-
-    @param[in] key  key of the element to access
-
-    @return reference to the element at key @a key
-
-    @throw type_error.304 if the JSON value is not an object; in this case,
-    calling `at` with a key makes no sense. See example below.
-    @throw out_of_range.403 if the key @a key is is not stored in the object;
-    that is, `find(key) == end()`. See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Logarithmic in the size of the container.
-
-    @sa @ref operator[](const typename object_t::key_type&) for unchecked
-    access by reference
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.0.0
-
-    @liveexample{The example below shows how object elements can be read and
-    written using `at()`. It also demonstrates the different exceptions that
-    can be thrown.,at__object_t_key_type}
-    */
-    reference at(const typename object_t::key_type& key)
-    {
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            JSON_TRY
-            {
-                return m_value.object->at(key);
-            }
-            JSON_CATCH (std::out_of_range&)
-            {
-                // create better exception explanation
-                JSON_THROW(out_of_range::create(403, "key '" + key + "' not found"));
-            }
-        }
-        else
-        {
-            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief access specified object element with bounds checking
-
-    Returns a const reference to the element at with specified key @a key,
-    with bounds checking.
-
-    @param[in] key  key of the element to access
-
-    @return const reference to the element at key @a key
-
-    @throw type_error.304 if the JSON value is not an object; in this case,
-    calling `at` with a key makes no sense. See example below.
-    @throw out_of_range.403 if the key @a key is is not stored in the object;
-    that is, `find(key) == end()`. See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Logarithmic in the size of the container.
-
-    @sa @ref operator[](const typename object_t::key_type&) for unchecked
-    access by reference
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.0.0
-
-    @liveexample{The example below shows how object elements can be read using
-    `at()`. It also demonstrates the different exceptions that can be thrown.,
-    at__object_t_key_type_const}
-    */
-    const_reference at(const typename object_t::key_type& key) const
-    {
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            JSON_TRY
-            {
-                return m_value.object->at(key);
-            }
-            JSON_CATCH (std::out_of_range&)
-            {
-                // create better exception explanation
-                JSON_THROW(out_of_range::create(403, "key '" + key + "' not found"));
-            }
-        }
-        else
-        {
-            JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief access specified array element
-
-    Returns a reference to the element at specified location @a idx.
-
-    @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
-    then the array is silently filled up with `null` values to make `idx` a
-    valid reference to the last stored element.
-
-    @param[in] idx  index of the element to access
-
-    @return reference to the element at index @a idx
-
-    @throw type_error.305 if the JSON value is not an array or null; in that
-    cases, using the [] operator with an index makes no sense.
-
-    @complexity Constant if @a idx is in the range of the array. Otherwise
-    linear in `idx - size()`.
-
-    @liveexample{The example below shows how array elements can be read and
-    written using `[]` operator. Note the addition of `null`
-    values.,operatorarray__size_type}
-
-    @since version 1.0.0
-    */
-    reference operator[](size_type idx)
-    {
-        // implicitly convert null value to an empty array
-        if (is_null())
-        {
-            m_type = value_t::array;
-            m_value.array = create<array_t>();
-            assert_invariant();
-        }
-
-        // operator[] only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            // fill up array with null values if given idx is outside range
-            if (idx >= m_value.array->size())
-            {
-                m_value.array->insert(m_value.array->end(),
-                                      idx - m_value.array->size() + 1,
-                                      basic_json());
-            }
-
-            return m_value.array->operator[](idx);
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief access specified array element
-
-    Returns a const reference to the element at specified location @a idx.
-
-    @param[in] idx  index of the element to access
-
-    @return const reference to the element at index @a idx
-
-    @throw type_error.305 if the JSON value is not an array; in that case,
-    using the [] operator with an index makes no sense.
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how array elements can be read using
-    the `[]` operator.,operatorarray__size_type_const}
-
-    @since version 1.0.0
-    */
-    const_reference operator[](size_type idx) const
-    {
-        // const operator[] only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            return m_value.array->operator[](idx);
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief access specified object element
-
-    Returns a reference to the element at with specified key @a key.
-
-    @note If @a key is not found in the object, then it is silently added to
-    the object and filled with a `null` value to make `key` a valid reference.
-    In case the value was `null` before, it is converted to an object.
-
-    @param[in] key  key of the element to access
-
-    @return reference to the element at key @a key
-
-    @throw type_error.305 if the JSON value is not an object or null; in that
-    cases, using the [] operator with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be read and
-    written using the `[]` operator.,operatorarray__key_type}
-
-    @sa @ref at(const typename object_t::key_type&) for access by reference
-    with range checking
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.0.0
-    */
-    reference operator[](const typename object_t::key_type& key)
-    {
-        // implicitly convert null value to an empty object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
-            assert_invariant();
-        }
-
-        // operator[] only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            return m_value.object->operator[](key);
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief read-only access specified object element
-
-    Returns a const reference to the element at with specified key @a key. No
-    bounds checking is performed.
-
-    @warning If the element with key @a key does not exist, the behavior is
-    undefined.
-
-    @param[in] key  key of the element to access
-
-    @return const reference to the element at key @a key
-
-    @pre The element with key @a key must exist. **This precondition is
-         enforced with an assertion.**
-
-    @throw type_error.305 if the JSON value is not an object; in that case,
-    using the [] operator with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be read using
-    the `[]` operator.,operatorarray__key_type_const}
-
-    @sa @ref at(const typename object_t::key_type&) for access by reference
-    with range checking
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.0.0
-    */
-    const_reference operator[](const typename object_t::key_type& key) const
-    {
-        // const operator[] only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            JSON_ASSERT(m_value.object->find(key) != m_value.object->end());
-            return m_value.object->find(key)->second;
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief access specified object element
-
-    Returns a reference to the element at with specified key @a key.
-
-    @note If @a key is not found in the object, then it is silently added to
-    the object and filled with a `null` value to make `key` a valid reference.
-    In case the value was `null` before, it is converted to an object.
-
-    @param[in] key  key of the element to access
-
-    @return reference to the element at key @a key
-
-    @throw type_error.305 if the JSON value is not an object or null; in that
-    cases, using the [] operator with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be read and
-    written using the `[]` operator.,operatorarray__key_type}
-
-    @sa @ref at(const typename object_t::key_type&) for access by reference
-    with range checking
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.1.0
-    */
-    template<typename T>
-    JSON_HEDLEY_NON_NULL(2)
-    reference operator[](T* key)
-    {
-        // implicitly convert null to object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value = value_t::object;
-            assert_invariant();
-        }
-
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            return m_value.object->operator[](key);
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief read-only access specified object element
-
-    Returns a const reference to the element at with specified key @a key. No
-    bounds checking is performed.
-
-    @warning If the element with key @a key does not exist, the behavior is
-    undefined.
-
-    @param[in] key  key of the element to access
-
-    @return const reference to the element at key @a key
-
-    @pre The element with key @a key must exist. **This precondition is
-         enforced with an assertion.**
-
-    @throw type_error.305 if the JSON value is not an object; in that case,
-    using the [] operator with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be read using
-    the `[]` operator.,operatorarray__key_type_const}
-
-    @sa @ref at(const typename object_t::key_type&) for access by reference
-    with range checking
-    @sa @ref value() for access by value with a default value
-
-    @since version 1.1.0
-    */
-    template<typename T>
-    JSON_HEDLEY_NON_NULL(2)
-    const_reference operator[](T* key) const
-    {
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            JSON_ASSERT(m_value.object->find(key) != m_value.object->end());
-            return m_value.object->find(key)->second;
-        }
-
-        JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief access specified object element with default value
-
-    Returns either a copy of an object's element at the specified key @a key
-    or a given default value if no element with key @a key exists.
-
-    The function is basically equivalent to executing
-    @code {.cpp}
-    try {
-        return at(key);
-    } catch(out_of_range) {
-        return default_value;
-    }
-    @endcode
-
-    @note Unlike @ref at(const typename object_t::key_type&), this function
-    does not throw if the given key @a key was not found.
-
-    @note Unlike @ref operator[](const typename object_t::key_type& key), this
-    function does not implicitly add an element to the position defined by @a
-    key. This function is furthermore also applicable to const objects.
-
-    @param[in] key  key of the element to access
-    @param[in] default_value  the value to return if @a key is not found
-
-    @tparam ValueType type compatible to JSON values, for instance `int` for
-    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
-    JSON arrays. Note the type of the expected value at @a key and the default
-    value @a default_value must be compatible.
-
-    @return copy of the element at key @a key or @a default_value if @a key
-    is not found
-
-    @throw type_error.302 if @a default_value does not match the type of the
-    value at @a key
-    @throw type_error.306 if the JSON value is not an object; in that case,
-    using `value()` with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be queried
-    with a default value.,basic_json__value}
-
-    @sa @ref at(const typename object_t::key_type&) for access by reference
-    with range checking
-    @sa @ref operator[](const typename object_t::key_type&) for unchecked
-    access by reference
-
-    @since version 1.0.0
-    */
-    // using std::is_convertible in a std::enable_if will fail when using explicit conversions
-    template < class ValueType, typename std::enable_if <
-                   detail::is_getable<basic_json_t, ValueType>::value
-                   && !std::is_same<value_t, ValueType>::value, int >::type = 0 >
-    ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const
-    {
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            // if key is found, return value and given default value otherwise
-            const auto it = find(key);
-            if (it != end())
-            {
-                return it->template get<ValueType>();
-            }
-
-            return default_value;
-        }
-
-        JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief overload for a default value of type const char*
-    @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const
-    */
-    string_t value(const typename object_t::key_type& key, const char* default_value) const
-    {
-        return value(key, string_t(default_value));
-    }
-
-    /*!
-    @brief access specified object element via JSON Pointer with default value
-
-    Returns either a copy of an object's element at the specified key @a key
-    or a given default value if no element with key @a key exists.
-
-    The function is basically equivalent to executing
-    @code {.cpp}
-    try {
-        return at(ptr);
-    } catch(out_of_range) {
-        return default_value;
-    }
-    @endcode
-
-    @note Unlike @ref at(const json_pointer&), this function does not throw
-    if the given key @a key was not found.
-
-    @param[in] ptr  a JSON pointer to the element to access
-    @param[in] default_value  the value to return if @a ptr found no value
-
-    @tparam ValueType type compatible to JSON values, for instance `int` for
-    JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
-    JSON arrays. Note the type of the expected value at @a key and the default
-    value @a default_value must be compatible.
-
-    @return copy of the element at key @a key or @a default_value if @a key
-    is not found
-
-    @throw type_error.302 if @a default_value does not match the type of the
-    value at @a ptr
-    @throw type_error.306 if the JSON value is not an object; in that case,
-    using `value()` with a key makes no sense.
-
-    @complexity Logarithmic in the size of the container.
-
-    @liveexample{The example below shows how object elements can be queried
-    with a default value.,basic_json__value_ptr}
-
-    @sa @ref operator[](const json_pointer&) for unchecked access by reference
-
-    @since version 2.0.2
-    */
-    template<class ValueType, typename std::enable_if<
-                 detail::is_getable<basic_json_t, ValueType>::value, int>::type = 0>
-    ValueType value(const json_pointer& ptr, const ValueType& default_value) const
-    {
-        // at only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            // if pointer resolves a value, return it or use default value
-            JSON_TRY
-            {
-                return ptr.get_checked(this).template get<ValueType>();
-            }
-            JSON_INTERNAL_CATCH (out_of_range&)
-            {
-                return default_value;
-            }
-        }
-
-        JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief overload for a default value of type const char*
-    @copydoc basic_json::value(const json_pointer&, ValueType) const
-    */
-    JSON_HEDLEY_NON_NULL(3)
-    string_t value(const json_pointer& ptr, const char* default_value) const
-    {
-        return value(ptr, string_t(default_value));
-    }
-
-    /*!
-    @brief access the first element
-
-    Returns a reference to the first element in the container. For a JSON
-    container `c`, the expression `c.front()` is equivalent to `*c.begin()`.
-
-    @return In case of a structured type (array or object), a reference to the
-    first element is returned. In case of number, string, boolean, or binary
-    values, a reference to the value is returned.
-
-    @complexity Constant.
-
-    @pre The JSON value must not be `null` (would throw `std::out_of_range`)
-    or an empty array or object (undefined behavior, **guarded by
-    assertions**).
-    @post The JSON value remains unchanged.
-
-    @throw invalid_iterator.214 when called on `null` value
-
-    @liveexample{The following code shows an example for `front()`.,front}
-
-    @sa @ref back() -- access the last element
-
-    @since version 1.0.0
-    */
-    reference front()
-    {
-        return *begin();
-    }
-
-    /*!
-    @copydoc basic_json::front()
-    */
-    const_reference front() const
-    {
-        return *cbegin();
-    }
-
-    /*!
-    @brief access the last element
-
-    Returns a reference to the last element in the container. For a JSON
-    container `c`, the expression `c.back()` is equivalent to
-    @code {.cpp}
-    auto tmp = c.end();
-    --tmp;
-    return *tmp;
-    @endcode
-
-    @return In case of a structured type (array or object), a reference to the
-    last element is returned. In case of number, string, boolean, or binary
-    values, a reference to the value is returned.
-
-    @complexity Constant.
-
-    @pre The JSON value must not be `null` (would throw `std::out_of_range`)
-    or an empty array or object (undefined behavior, **guarded by
-    assertions**).
-    @post The JSON value remains unchanged.
-
-    @throw invalid_iterator.214 when called on a `null` value. See example
-    below.
-
-    @liveexample{The following code shows an example for `back()`.,back}
-
-    @sa @ref front() -- access the first element
-
-    @since version 1.0.0
-    */
-    reference back()
-    {
-        auto tmp = end();
-        --tmp;
-        return *tmp;
-    }
-
-    /*!
-    @copydoc basic_json::back()
-    */
-    const_reference back() const
-    {
-        auto tmp = cend();
-        --tmp;
-        return *tmp;
-    }
-
-    /*!
-    @brief remove element given an iterator
-
-    Removes the element specified by iterator @a pos. The iterator @a pos must
-    be valid and dereferenceable. Thus the `end()` iterator (which is valid,
-    but is not dereferenceable) cannot be used as a value for @a pos.
-
-    If called on a primitive type other than `null`, the resulting JSON value
-    will be `null`.
-
-    @param[in] pos iterator to the element to remove
-    @return Iterator following the last removed element. If the iterator @a
-    pos refers to the last element, the `end()` iterator is returned.
-
-    @tparam IteratorType an @ref iterator or @ref const_iterator
-
-    @post Invalidates iterators and references at or after the point of the
-    erase, including the `end()` iterator.
-
-    @throw type_error.307 if called on a `null` value; example: `"cannot use
-    erase() with null"`
-    @throw invalid_iterator.202 if called on an iterator which does not belong
-    to the current JSON value; example: `"iterator does not fit current
-    value"`
-    @throw invalid_iterator.205 if called on a primitive type with invalid
-    iterator (i.e., any iterator which is not `begin()`); example: `"iterator
-    out of range"`
-
-    @complexity The complexity depends on the type:
-    - objects: amortized constant
-    - arrays: linear in distance between @a pos and the end of the container
-    - strings and binary: linear in the length of the member
-    - other types: constant
-
-    @liveexample{The example shows the result of `erase()` for different JSON
-    types.,erase__IteratorType}
-
-    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
-    the given range
-    @sa @ref erase(const typename object_t::key_type&) -- removes the element
-    from an object at the given key
-    @sa @ref erase(const size_type) -- removes the element from an array at
-    the given index
-
-    @since version 1.0.0
-    */
-    template < class IteratorType, typename std::enable_if <
-                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
-                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
-               = 0 >
-    IteratorType erase(IteratorType pos)
-    {
-        // make sure iterator fits the current value
-        if (JSON_HEDLEY_UNLIKELY(this != pos.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
-        }
-
-        IteratorType result = end();
-
-        switch (m_type)
-        {
-            case value_t::boolean:
-            case value_t::number_float:
-            case value_t::number_integer:
-            case value_t::number_unsigned:
-            case value_t::string:
-            case value_t::binary:
-            {
-                if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin()))
-                {
-                    JSON_THROW(invalid_iterator::create(205, "iterator out of range"));
-                }
-
-                if (is_string())
-                {
-                    AllocatorType<string_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
-                    m_value.string = nullptr;
-                }
-                else if (is_binary())
-                {
-                    AllocatorType<binary_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
-                    m_value.binary = nullptr;
-                }
-
-                m_type = value_t::null;
-                assert_invariant();
-                break;
-            }
-
-            case value_t::object:
-            {
-                result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
-                break;
-            }
-
-            case value_t::array:
-            {
-                result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
-                break;
-            }
-
-            default:
-                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief remove elements given an iterator range
-
-    Removes the element specified by the range `[first; last)`. The iterator
-    @a first does not need to be dereferenceable if `first == last`: erasing
-    an empty range is a no-op.
-
-    If called on a primitive type other than `null`, the resulting JSON value
-    will be `null`.
-
-    @param[in] first iterator to the beginning of the range to remove
-    @param[in] last iterator past the end of the range to remove
-    @return Iterator following the last removed element. If the iterator @a
-    second refers to the last element, the `end()` iterator is returned.
-
-    @tparam IteratorType an @ref iterator or @ref const_iterator
-
-    @post Invalidates iterators and references at or after the point of the
-    erase, including the `end()` iterator.
-
-    @throw type_error.307 if called on a `null` value; example: `"cannot use
-    erase() with null"`
-    @throw invalid_iterator.203 if called on iterators which does not belong
-    to the current JSON value; example: `"iterators do not fit current value"`
-    @throw invalid_iterator.204 if called on a primitive type with invalid
-    iterators (i.e., if `first != begin()` and `last != end()`); example:
-    `"iterators out of range"`
-
-    @complexity The complexity depends on the type:
-    - objects: `log(size()) + std::distance(first, last)`
-    - arrays: linear in the distance between @a first and @a last, plus linear
-      in the distance between @a last and end of the container
-    - strings and binary: linear in the length of the member
-    - other types: constant
-
-    @liveexample{The example shows the result of `erase()` for different JSON
-    types.,erase__IteratorType_IteratorType}
-
-    @sa @ref erase(IteratorType) -- removes the element at a given position
-    @sa @ref erase(const typename object_t::key_type&) -- removes the element
-    from an object at the given key
-    @sa @ref erase(const size_type) -- removes the element from an array at
-    the given index
-
-    @since version 1.0.0
-    */
-    template < class IteratorType, typename std::enable_if <
-                   std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
-                   std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
-               = 0 >
-    IteratorType erase(IteratorType first, IteratorType last)
-    {
-        // make sure iterator fits the current value
-        if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value"));
-        }
-
-        IteratorType result = end();
-
-        switch (m_type)
-        {
-            case value_t::boolean:
-            case value_t::number_float:
-            case value_t::number_integer:
-            case value_t::number_unsigned:
-            case value_t::string:
-            case value_t::binary:
-            {
-                if (JSON_HEDLEY_LIKELY(!first.m_it.primitive_iterator.is_begin()
-                                       || !last.m_it.primitive_iterator.is_end()))
-                {
-                    JSON_THROW(invalid_iterator::create(204, "iterators out of range"));
-                }
-
-                if (is_string())
-                {
-                    AllocatorType<string_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
-                    m_value.string = nullptr;
-                }
-                else if (is_binary())
-                {
-                    AllocatorType<binary_t> alloc;
-                    std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
-                    std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
-                    m_value.binary = nullptr;
-                }
-
-                m_type = value_t::null;
-                assert_invariant();
-                break;
-            }
-
-            case value_t::object:
-            {
-                result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
-                                              last.m_it.object_iterator);
-                break;
-            }
-
-            case value_t::array:
-            {
-                result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
-                                             last.m_it.array_iterator);
-                break;
-            }
-
-            default:
-                JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief remove element from a JSON object given a key
-
-    Removes elements from a JSON object with the key value @a key.
-
-    @param[in] key value of the elements to remove
-
-    @return Number of elements removed. If @a ObjectType is the default
-    `std::map` type, the return value will always be `0` (@a key was not
-    found) or `1` (@a key was found).
-
-    @post References and iterators to the erased elements are invalidated.
-    Other references and iterators are not affected.
-
-    @throw type_error.307 when called on a type other than JSON object;
-    example: `"cannot use erase() with null"`
-
-    @complexity `log(size()) + count(key)`
-
-    @liveexample{The example shows the effect of `erase()`.,erase__key_type}
-
-    @sa @ref erase(IteratorType) -- removes the element at a given position
-    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
-    the given range
-    @sa @ref erase(const size_type) -- removes the element from an array at
-    the given index
-
-    @since version 1.0.0
-    */
-    size_type erase(const typename object_t::key_type& key)
-    {
-        // this erase only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            return m_value.object->erase(key);
-        }
-
-        JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief remove element from a JSON array given an index
-
-    Removes element from a JSON array at the index @a idx.
-
-    @param[in] idx index of the element to remove
-
-    @throw type_error.307 when called on a type other than JSON object;
-    example: `"cannot use erase() with null"`
-    @throw out_of_range.401 when `idx >= size()`; example: `"array index 17
-    is out of range"`
-
-    @complexity Linear in distance between @a idx and the end of the container.
-
-    @liveexample{The example shows the effect of `erase()`.,erase__size_type}
-
-    @sa @ref erase(IteratorType) -- removes the element at a given position
-    @sa @ref erase(IteratorType, IteratorType) -- removes the elements in
-    the given range
-    @sa @ref erase(const typename object_t::key_type&) -- removes the element
-    from an object at the given key
-
-    @since version 1.0.0
-    */
-    void erase(const size_type idx)
-    {
-        // this erase only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            if (JSON_HEDLEY_UNLIKELY(idx >= size()))
-            {
-                JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
-            }
-
-            m_value.array->erase(m_value.array->begin() + static_cast<difference_type>(idx));
-        }
-        else
-        {
-            JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name())));
-        }
-    }
-
-    /// @}
-
-
-    ////////////
-    // lookup //
-    ////////////
-
-    /// @name lookup
-    /// @{
-
-    /*!
-    @brief find an element in a JSON object
-
-    Finds an element in a JSON object with key equivalent to @a key. If the
-    element is not found or the JSON value is not an object, end() is
-    returned.
-
-    @note This method always returns @ref end() when executed on a JSON type
-          that is not an object.
-
-    @param[in] key key value of the element to search for.
-
-    @return Iterator to an element with key equivalent to @a key. If no such
-    element is found or the JSON value is not an object, past-the-end (see
-    @ref end()) iterator is returned.
-
-    @complexity Logarithmic in the size of the JSON object.
-
-    @liveexample{The example shows how `find()` is used.,find__key_type}
-
-    @sa @ref contains(KeyT&&) const -- checks whether a key exists
-
-    @since version 1.0.0
-    */
-    template<typename KeyT>
-    iterator find(KeyT&& key)
-    {
-        auto result = end();
-
-        if (is_object())
-        {
-            result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief find an element in a JSON object
-    @copydoc find(KeyT&&)
-    */
-    template<typename KeyT>
-    const_iterator find(KeyT&& key) const
-    {
-        auto result = cend();
-
-        if (is_object())
-        {
-            result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief returns the number of occurrences of a key in a JSON object
-
-    Returns the number of elements with key @a key. If ObjectType is the
-    default `std::map` type, the return value will always be `0` (@a key was
-    not found) or `1` (@a key was found).
-
-    @note This method always returns `0` when executed on a JSON type that is
-          not an object.
-
-    @param[in] key key value of the element to count
-
-    @return Number of elements with key @a key. If the JSON value is not an
-    object, the return value will be `0`.
-
-    @complexity Logarithmic in the size of the JSON object.
-
-    @liveexample{The example shows how `count()` is used.,count}
-
-    @since version 1.0.0
-    */
-    template<typename KeyT>
-    size_type count(KeyT&& key) const
-    {
-        // return 0 for all nonobject types
-        return is_object() ? m_value.object->count(std::forward<KeyT>(key)) : 0;
-    }
-
-    /*!
-    @brief check the existence of an element in a JSON object
-
-    Check whether an element exists in a JSON object with key equivalent to
-    @a key. If the element is not found or the JSON value is not an object,
-    false is returned.
-
-    @note This method always returns false when executed on a JSON type
-          that is not an object.
-
-    @param[in] key key value to check its existence.
-
-    @return true if an element with specified @a key exists. If no such
-    element with such key is found or the JSON value is not an object,
-    false is returned.
-
-    @complexity Logarithmic in the size of the JSON object.
-
-    @liveexample{The following code shows an example for `contains()`.,contains}
-
-    @sa @ref find(KeyT&&) -- returns an iterator to an object element
-    @sa @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer
-
-    @since version 3.6.0
-    */
-    template < typename KeyT, typename std::enable_if <
-                   !std::is_same<typename std::decay<KeyT>::type, json_pointer>::value, int >::type = 0 >
-    bool contains(KeyT && key) const
-    {
-        return is_object() && m_value.object->find(std::forward<KeyT>(key)) != m_value.object->end();
-    }
-
-    /*!
-    @brief check the existence of an element in a JSON object given a JSON pointer
-
-    Check whether the given JSON pointer @a ptr can be resolved in the current
-    JSON value.
-
-    @note This method can be executed on any JSON value type.
-
-    @param[in] ptr JSON pointer to check its existence.
-
-    @return true if the JSON pointer can be resolved to a stored value, false
-    otherwise.
-
-    @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`.
-
-    @throw parse_error.106   if an array index begins with '0'
-    @throw parse_error.109   if an array index was not a number
-
-    @complexity Logarithmic in the size of the JSON object.
-
-    @liveexample{The following code shows an example for `contains()`.,contains_json_pointer}
-
-    @sa @ref contains(KeyT &&) const -- checks the existence of a key
-
-    @since version 3.7.0
-    */
-    bool contains(const json_pointer& ptr) const
-    {
-        return ptr.contains(this);
-    }
-
-    /// @}
-
-
-    ///////////////
-    // iterators //
-    ///////////////
-
-    /// @name iterators
-    /// @{
-
-    /*!
-    @brief returns an iterator to the first element
-
-    Returns an iterator to the first element.
-
-    @image html range-begin-end.svg "Illustration from cppreference.com"
-
-    @return iterator to the first element
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-
-    @liveexample{The following code shows an example for `begin()`.,begin}
-
-    @sa @ref cbegin() -- returns a const iterator to the beginning
-    @sa @ref end() -- returns an iterator to the end
-    @sa @ref cend() -- returns a const iterator to the end
-
-    @since version 1.0.0
-    */
-    iterator begin() noexcept
-    {
-        iterator result(this);
-        result.set_begin();
-        return result;
-    }
-
-    /*!
-    @copydoc basic_json::cbegin()
-    */
-    const_iterator begin() const noexcept
-    {
-        return cbegin();
-    }
-
-    /*!
-    @brief returns a const iterator to the first element
-
-    Returns a const iterator to the first element.
-
-    @image html range-begin-end.svg "Illustration from cppreference.com"
-
-    @return const iterator to the first element
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
-
-    @liveexample{The following code shows an example for `cbegin()`.,cbegin}
-
-    @sa @ref begin() -- returns an iterator to the beginning
-    @sa @ref end() -- returns an iterator to the end
-    @sa @ref cend() -- returns a const iterator to the end
-
-    @since version 1.0.0
-    */
-    const_iterator cbegin() const noexcept
-    {
-        const_iterator result(this);
-        result.set_begin();
-        return result;
-    }
-
-    /*!
-    @brief returns an iterator to one past the last element
-
-    Returns an iterator to one past the last element.
-
-    @image html range-begin-end.svg "Illustration from cppreference.com"
-
-    @return iterator one past the last element
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-
-    @liveexample{The following code shows an example for `end()`.,end}
-
-    @sa @ref cend() -- returns a const iterator to the end
-    @sa @ref begin() -- returns an iterator to the beginning
-    @sa @ref cbegin() -- returns a const iterator to the beginning
-
-    @since version 1.0.0
-    */
-    iterator end() noexcept
-    {
-        iterator result(this);
-        result.set_end();
-        return result;
-    }
-
-    /*!
-    @copydoc basic_json::cend()
-    */
-    const_iterator end() const noexcept
-    {
-        return cend();
-    }
-
-    /*!
-    @brief returns a const iterator to one past the last element
-
-    Returns a const iterator to one past the last element.
-
-    @image html range-begin-end.svg "Illustration from cppreference.com"
-
-    @return const iterator one past the last element
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
-
-    @liveexample{The following code shows an example for `cend()`.,cend}
-
-    @sa @ref end() -- returns an iterator to the end
-    @sa @ref begin() -- returns an iterator to the beginning
-    @sa @ref cbegin() -- returns a const iterator to the beginning
-
-    @since version 1.0.0
-    */
-    const_iterator cend() const noexcept
-    {
-        const_iterator result(this);
-        result.set_end();
-        return result;
-    }
-
-    /*!
-    @brief returns an iterator to the reverse-beginning
-
-    Returns an iterator to the reverse-beginning; that is, the last element.
-
-    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `reverse_iterator(end())`.
-
-    @liveexample{The following code shows an example for `rbegin()`.,rbegin}
-
-    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
-    @sa @ref rend() -- returns a reverse iterator to the end
-    @sa @ref crend() -- returns a const reverse iterator to the end
-
-    @since version 1.0.0
-    */
-    reverse_iterator rbegin() noexcept
-    {
-        return reverse_iterator(end());
-    }
-
-    /*!
-    @copydoc basic_json::crbegin()
-    */
-    const_reverse_iterator rbegin() const noexcept
-    {
-        return crbegin();
-    }
-
-    /*!
-    @brief returns an iterator to the reverse-end
-
-    Returns an iterator to the reverse-end; that is, one before the first
-    element.
-
-    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `reverse_iterator(begin())`.
-
-    @liveexample{The following code shows an example for `rend()`.,rend}
-
-    @sa @ref crend() -- returns a const reverse iterator to the end
-    @sa @ref rbegin() -- returns a reverse iterator to the beginning
-    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
-
-    @since version 1.0.0
-    */
-    reverse_iterator rend() noexcept
-    {
-        return reverse_iterator(begin());
-    }
-
-    /*!
-    @copydoc basic_json::crend()
-    */
-    const_reverse_iterator rend() const noexcept
-    {
-        return crend();
-    }
-
-    /*!
-    @brief returns a const reverse iterator to the last element
-
-    Returns a const iterator to the reverse-beginning; that is, the last
-    element.
-
-    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
-
-    @liveexample{The following code shows an example for `crbegin()`.,crbegin}
-
-    @sa @ref rbegin() -- returns a reverse iterator to the beginning
-    @sa @ref rend() -- returns a reverse iterator to the end
-    @sa @ref crend() -- returns a const reverse iterator to the end
-
-    @since version 1.0.0
-    */
-    const_reverse_iterator crbegin() const noexcept
-    {
-        return const_reverse_iterator(cend());
-    }
-
-    /*!
-    @brief returns a const reverse iterator to one before the first
-
-    Returns a const reverse iterator to the reverse-end; that is, one before
-    the first element.
-
-    @image html range-rbegin-rend.svg "Illustration from cppreference.com"
-
-    @complexity Constant.
-
-    @requirement This function helps `basic_json` satisfying the
-    [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
-
-    @liveexample{The following code shows an example for `crend()`.,crend}
-
-    @sa @ref rend() -- returns a reverse iterator to the end
-    @sa @ref rbegin() -- returns a reverse iterator to the beginning
-    @sa @ref crbegin() -- returns a const reverse iterator to the beginning
-
-    @since version 1.0.0
-    */
-    const_reverse_iterator crend() const noexcept
-    {
-        return const_reverse_iterator(cbegin());
-    }
-
-  public:
-    /*!
-    @brief wrapper to access iterator member functions in range-based for
-
-    This function allows to access @ref iterator::key() and @ref
-    iterator::value() during range-based for loops. In these loops, a
-    reference to the JSON values is returned, so there is no access to the
-    underlying iterator.
-
-    For loop without iterator_wrapper:
-
-    @code{cpp}
-    for (auto it = j_object.begin(); it != j_object.end(); ++it)
-    {
-        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
-    }
-    @endcode
-
-    Range-based for loop without iterator proxy:
-
-    @code{cpp}
-    for (auto it : j_object)
-    {
-        // "it" is of type json::reference and has no key() member
-        std::cout << "value: " << it << '\n';
-    }
-    @endcode
-
-    Range-based for loop with iterator proxy:
-
-    @code{cpp}
-    for (auto it : json::iterator_wrapper(j_object))
-    {
-        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
-    }
-    @endcode
-
-    @note When iterating over an array, `key()` will return the index of the
-          element as string (see example).
-
-    @param[in] ref  reference to a JSON value
-    @return iteration proxy object wrapping @a ref with an interface to use in
-            range-based for loops
-
-    @liveexample{The following code shows how the wrapper is used,iterator_wrapper}
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @note The name of this function is not yet final and may change in the
-    future.
-
-    @deprecated This stream operator is deprecated and will be removed in
-                future 4.0.0 of the library. Please use @ref items() instead;
-                that is, replace `json::iterator_wrapper(j)` with `j.items()`.
-    */
-    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
-    static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
-    {
-        return ref.items();
-    }
-
-    /*!
-    @copydoc iterator_wrapper(reference)
-    */
-    JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
-    static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
-    {
-        return ref.items();
-    }
-
-    /*!
-    @brief helper to access iterator member functions in range-based for
-
-    This function allows to access @ref iterator::key() and @ref
-    iterator::value() during range-based for loops. In these loops, a
-    reference to the JSON values is returned, so there is no access to the
-    underlying iterator.
-
-    For loop without `items()` function:
-
-    @code{cpp}
-    for (auto it = j_object.begin(); it != j_object.end(); ++it)
-    {
-        std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
-    }
-    @endcode
-
-    Range-based for loop without `items()` function:
-
-    @code{cpp}
-    for (auto it : j_object)
-    {
-        // "it" is of type json::reference and has no key() member
-        std::cout << "value: " << it << '\n';
-    }
-    @endcode
-
-    Range-based for loop with `items()` function:
-
-    @code{cpp}
-    for (auto& el : j_object.items())
-    {
-        std::cout << "key: " << el.key() << ", value:" << el.value() << '\n';
-    }
-    @endcode
-
-    The `items()` function also allows to use
-    [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding)
-    (C++17):
-
-    @code{cpp}
-    for (auto& [key, val] : j_object.items())
-    {
-        std::cout << "key: " << key << ", value:" << val << '\n';
-    }
-    @endcode
-
-    @note When iterating over an array, `key()` will return the index of the
-          element as string (see example). For primitive types (e.g., numbers),
-          `key()` returns an empty string.
-
-    @warning Using `items()` on temporary objects is dangerous. Make sure the
-             object's lifetime exeeds the iteration. See
-             <https://github.com/nlohmann/json/issues/2040> for more
-             information.
-
-    @return iteration proxy object wrapping @a ref with an interface to use in
-            range-based for loops
-
-    @liveexample{The following code shows how the function is used.,items}
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @since version 3.1.0, structured bindings support since 3.5.0.
-    */
-    iteration_proxy<iterator> items() noexcept
-    {
-        return iteration_proxy<iterator>(*this);
-    }
-
-    /*!
-    @copydoc items()
-    */
-    iteration_proxy<const_iterator> items() const noexcept
-    {
-        return iteration_proxy<const_iterator>(*this);
-    }
-
-    /// @}
-
-
-    //////////////
-    // capacity //
-    //////////////
-
-    /// @name capacity
-    /// @{
-
-    /*!
-    @brief checks whether the container is empty.
-
-    Checks if a JSON value has no elements (i.e. whether its @ref size is `0`).
-
-    @return The return value depends on the different types and is
-            defined as follows:
-            Value type  | return value
-            ----------- | -------------
-            null        | `true`
-            boolean     | `false`
-            string      | `false`
-            number      | `false`
-            binary      | `false`
-            object      | result of function `object_t::empty()`
-            array       | result of function `array_t::empty()`
-
-    @liveexample{The following code uses `empty()` to check if a JSON
-    object contains any elements.,empty}
-
-    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
-    the Container concept; that is, their `empty()` functions have constant
-    complexity.
-
-    @iterators No changes.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @note This function does not return whether a string stored as JSON value
-    is empty - it returns whether the JSON container itself is empty which is
-    false in the case of a string.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `begin() == end()`.
-
-    @sa @ref size() -- returns the number of elements
-
-    @since version 1.0.0
-    */
-    bool empty() const noexcept
-    {
-        switch (m_type)
-        {
-            case value_t::null:
-            {
-                // null values are empty
-                return true;
-            }
-
-            case value_t::array:
-            {
-                // delegate call to array_t::empty()
-                return m_value.array->empty();
-            }
-
-            case value_t::object:
-            {
-                // delegate call to object_t::empty()
-                return m_value.object->empty();
-            }
-
-            default:
-            {
-                // all other types are nonempty
-                return false;
-            }
-        }
-    }
-
-    /*!
-    @brief returns the number of elements
-
-    Returns the number of elements in a JSON value.
-
-    @return The return value depends on the different types and is
-            defined as follows:
-            Value type  | return value
-            ----------- | -------------
-            null        | `0`
-            boolean     | `1`
-            string      | `1`
-            number      | `1`
-            binary      | `1`
-            object      | result of function object_t::size()
-            array       | result of function array_t::size()
-
-    @liveexample{The following code calls `size()` on the different value
-    types.,size}
-
-    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
-    the Container concept; that is, their size() functions have constant
-    complexity.
-
-    @iterators No changes.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @note This function does not return the length of a string stored as JSON
-    value - it returns the number of elements in the JSON value which is 1 in
-    the case of a string.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of `std::distance(begin(), end())`.
-
-    @sa @ref empty() -- checks whether the container is empty
-    @sa @ref max_size() -- returns the maximal number of elements
-
-    @since version 1.0.0
-    */
-    size_type size() const noexcept
-    {
-        switch (m_type)
-        {
-            case value_t::null:
-            {
-                // null values are empty
-                return 0;
-            }
-
-            case value_t::array:
-            {
-                // delegate call to array_t::size()
-                return m_value.array->size();
-            }
-
-            case value_t::object:
-            {
-                // delegate call to object_t::size()
-                return m_value.object->size();
-            }
-
-            default:
-            {
-                // all other types have size 1
-                return 1;
-            }
-        }
-    }
-
-    /*!
-    @brief returns the maximum possible number of elements
-
-    Returns the maximum number of elements a JSON value is able to hold due to
-    system or library implementation limitations, i.e. `std::distance(begin(),
-    end())` for the JSON value.
-
-    @return The return value depends on the different types and is
-            defined as follows:
-            Value type  | return value
-            ----------- | -------------
-            null        | `0` (same as `size()`)
-            boolean     | `1` (same as `size()`)
-            string      | `1` (same as `size()`)
-            number      | `1` (same as `size()`)
-            binary      | `1` (same as `size()`)
-            object      | result of function `object_t::max_size()`
-            array       | result of function `array_t::max_size()`
-
-    @liveexample{The following code calls `max_size()` on the different value
-    types. Note the output is implementation specific.,max_size}
-
-    @complexity Constant, as long as @ref array_t and @ref object_t satisfy
-    the Container concept; that is, their `max_size()` functions have constant
-    complexity.
-
-    @iterators No changes.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @requirement This function helps `basic_json` satisfying the
-    [Container](https://en.cppreference.com/w/cpp/named_req/Container)
-    requirements:
-    - The complexity is constant.
-    - Has the semantics of returning `b.size()` where `b` is the largest
-      possible JSON value.
-
-    @sa @ref size() -- returns the number of elements
-
-    @since version 1.0.0
-    */
-    size_type max_size() const noexcept
-    {
-        switch (m_type)
-        {
-            case value_t::array:
-            {
-                // delegate call to array_t::max_size()
-                return m_value.array->max_size();
-            }
-
-            case value_t::object:
-            {
-                // delegate call to object_t::max_size()
-                return m_value.object->max_size();
-            }
-
-            default:
-            {
-                // all other types have max_size() == size()
-                return size();
-            }
-        }
-    }
-
-    /// @}
-
-
-    ///////////////
-    // modifiers //
-    ///////////////
-
-    /// @name modifiers
-    /// @{
-
-    /*!
-    @brief clears the contents
-
-    Clears the content of a JSON value and resets it to the default value as
-    if @ref basic_json(value_t) would have been called with the current value
-    type from @ref type():
-
-    Value type  | initial value
-    ----------- | -------------
-    null        | `null`
-    boolean     | `false`
-    string      | `""`
-    number      | `0`
-    binary      | An empty byte vector
-    object      | `{}`
-    array       | `[]`
-
-    @post Has the same effect as calling
-    @code {.cpp}
-    *this = basic_json(type());
-    @endcode
-
-    @liveexample{The example below shows the effect of `clear()` to different
-    JSON types.,clear}
-
-    @complexity Linear in the size of the JSON value.
-
-    @iterators All iterators, pointers and references related to this container
-               are invalidated.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @sa @ref basic_json(value_t) -- constructor that creates an object with the
-        same value than calling `clear()`
-
-    @since version 1.0.0
-    */
-    void clear() noexcept
-    {
-        switch (m_type)
-        {
-            case value_t::number_integer:
-            {
-                m_value.number_integer = 0;
-                break;
-            }
-
-            case value_t::number_unsigned:
-            {
-                m_value.number_unsigned = 0;
-                break;
-            }
-
-            case value_t::number_float:
-            {
-                m_value.number_float = 0.0;
-                break;
-            }
-
-            case value_t::boolean:
-            {
-                m_value.boolean = false;
-                break;
-            }
-
-            case value_t::string:
-            {
-                m_value.string->clear();
-                break;
-            }
-
-            case value_t::binary:
-            {
-                m_value.binary->clear();
-                break;
-            }
-
-            case value_t::array:
-            {
-                m_value.array->clear();
-                break;
-            }
-
-            case value_t::object:
-            {
-                m_value.object->clear();
-                break;
-            }
-
-            default:
-                break;
-        }
-    }
-
-    /*!
-    @brief add an object to an array
-
-    Appends the given element @a val to the end of the JSON value. If the
-    function is called on a JSON null value, an empty array is created before
-    appending @a val.
-
-    @param[in] val the value to add to the JSON array
-
-    @throw type_error.308 when called on a type other than JSON array or
-    null; example: `"cannot use push_back() with number"`
-
-    @complexity Amortized constant.
-
-    @liveexample{The example shows how `push_back()` and `+=` can be used to
-    add elements to a JSON array. Note how the `null` value was silently
-    converted to a JSON array.,push_back}
-
-    @since version 1.0.0
-    */
-    void push_back(basic_json&& val)
-    {
-        // push_back only works for null objects or arrays
-        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
-        {
-            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
-        }
-
-        // transform null object into an array
-        if (is_null())
-        {
-            m_type = value_t::array;
-            m_value = value_t::array;
-            assert_invariant();
-        }
-
-        // add element to array (move semantics)
-        m_value.array->push_back(std::move(val));
-        // if val is moved from, basic_json move constructor marks it null so we do not call the destructor
-    }
-
-    /*!
-    @brief add an object to an array
-    @copydoc push_back(basic_json&&)
-    */
-    reference operator+=(basic_json&& val)
-    {
-        push_back(std::move(val));
-        return *this;
-    }
-
-    /*!
-    @brief add an object to an array
-    @copydoc push_back(basic_json&&)
-    */
-    void push_back(const basic_json& val)
-    {
-        // push_back only works for null objects or arrays
-        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
-        {
-            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
-        }
-
-        // transform null object into an array
-        if (is_null())
-        {
-            m_type = value_t::array;
-            m_value = value_t::array;
-            assert_invariant();
-        }
-
-        // add element to array
-        m_value.array->push_back(val);
-    }
-
-    /*!
-    @brief add an object to an array
-    @copydoc push_back(basic_json&&)
-    */
-    reference operator+=(const basic_json& val)
-    {
-        push_back(val);
-        return *this;
-    }
-
-    /*!
-    @brief add an object to an object
-
-    Inserts the given element @a val to the JSON object. If the function is
-    called on a JSON null value, an empty object is created before inserting
-    @a val.
-
-    @param[in] val the value to add to the JSON object
-
-    @throw type_error.308 when called on a type other than JSON object or
-    null; example: `"cannot use push_back() with number"`
-
-    @complexity Logarithmic in the size of the container, O(log(`size()`)).
-
-    @liveexample{The example shows how `push_back()` and `+=` can be used to
-    add elements to a JSON object. Note how the `null` value was silently
-    converted to a JSON object.,push_back__object_t__value}
-
-    @since version 1.0.0
-    */
-    void push_back(const typename object_t::value_type& val)
-    {
-        // push_back only works for null objects or objects
-        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
-        {
-            JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name())));
-        }
-
-        // transform null object into an object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value = value_t::object;
-            assert_invariant();
-        }
-
-        // add element to array
-        m_value.object->insert(val);
-    }
-
-    /*!
-    @brief add an object to an object
-    @copydoc push_back(const typename object_t::value_type&)
-    */
-    reference operator+=(const typename object_t::value_type& val)
-    {
-        push_back(val);
-        return *this;
-    }
-
-    /*!
-    @brief add an object to an object
-
-    This function allows to use `push_back` with an initializer list. In case
-
-    1. the current value is an object,
-    2. the initializer list @a init contains only two elements, and
-    3. the first element of @a init is a string,
-
-    @a init is converted into an object element and added using
-    @ref push_back(const typename object_t::value_type&). Otherwise, @a init
-    is converted to a JSON value and added using @ref push_back(basic_json&&).
-
-    @param[in] init  an initializer list
-
-    @complexity Linear in the size of the initializer list @a init.
-
-    @note This function is required to resolve an ambiguous overload error,
-          because pairs like `{"key", "value"}` can be both interpreted as
-          `object_t::value_type` or `std::initializer_list<basic_json>`, see
-          https://github.com/nlohmann/json/issues/235 for more information.
-
-    @liveexample{The example shows how initializer lists are treated as
-    objects when possible.,push_back__initializer_list}
-    */
-    void push_back(initializer_list_t init)
-    {
-        if (is_object() && init.size() == 2 && (*init.begin())->is_string())
-        {
-            basic_json&& key = init.begin()->moved_or_copied();
-            push_back(typename object_t::value_type(
-                          std::move(key.get_ref<string_t&>()), (init.begin() + 1)->moved_or_copied()));
-        }
-        else
-        {
-            push_back(basic_json(init));
-        }
-    }
-
-    /*!
-    @brief add an object to an object
-    @copydoc push_back(initializer_list_t)
-    */
-    reference operator+=(initializer_list_t init)
-    {
-        push_back(init);
-        return *this;
-    }
-
-    /*!
-    @brief add an object to an array
-
-    Creates a JSON value from the passed parameters @a args to the end of the
-    JSON value. If the function is called on a JSON null value, an empty array
-    is created before appending the value created from @a args.
-
-    @param[in] args arguments to forward to a constructor of @ref basic_json
-    @tparam Args compatible types to create a @ref basic_json object
-
-    @return reference to the inserted element
-
-    @throw type_error.311 when called on a type other than JSON array or
-    null; example: `"cannot use emplace_back() with number"`
-
-    @complexity Amortized constant.
-
-    @liveexample{The example shows how `push_back()` can be used to add
-    elements to a JSON array. Note how the `null` value was silently converted
-    to a JSON array.,emplace_back}
-
-    @since version 2.0.8, returns reference since 3.7.0
-    */
-    template<class... Args>
-    reference emplace_back(Args&& ... args)
-    {
-        // emplace_back only works for null objects or arrays
-        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
-        {
-            JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name())));
-        }
-
-        // transform null object into an array
-        if (is_null())
-        {
-            m_type = value_t::array;
-            m_value = value_t::array;
-            assert_invariant();
-        }
-
-        // add element to array (perfect forwarding)
-#ifdef JSON_HAS_CPP_17
-        return m_value.array->emplace_back(std::forward<Args>(args)...);
-#else
-        m_value.array->emplace_back(std::forward<Args>(args)...);
-        return m_value.array->back();
-#endif
-    }
-
-    /*!
-    @brief add an object to an object if key does not exist
-
-    Inserts a new element into a JSON object constructed in-place with the
-    given @a args if there is no element with the key in the container. If the
-    function is called on a JSON null value, an empty object is created before
-    appending the value created from @a args.
-
-    @param[in] args arguments to forward to a constructor of @ref basic_json
-    @tparam Args compatible types to create a @ref basic_json object
-
-    @return a pair consisting of an iterator to the inserted element, or the
-            already-existing element if no insertion happened, and a bool
-            denoting whether the insertion took place.
-
-    @throw type_error.311 when called on a type other than JSON object or
-    null; example: `"cannot use emplace() with number"`
-
-    @complexity Logarithmic in the size of the container, O(log(`size()`)).
-
-    @liveexample{The example shows how `emplace()` can be used to add elements
-    to a JSON object. Note how the `null` value was silently converted to a
-    JSON object. Further note how no value is added if there was already one
-    value stored with the same key.,emplace}
-
-    @since version 2.0.8
-    */
-    template<class... Args>
-    std::pair<iterator, bool> emplace(Args&& ... args)
-    {
-        // emplace only works for null objects or arrays
-        if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
-        {
-            JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name())));
-        }
-
-        // transform null object into an object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value = value_t::object;
-            assert_invariant();
-        }
-
-        // add element to array (perfect forwarding)
-        auto res = m_value.object->emplace(std::forward<Args>(args)...);
-        // create result iterator and set iterator to the result of emplace
-        auto it = begin();
-        it.m_it.object_iterator = res.first;
-
-        // return pair of iterator and boolean
-        return {it, res.second};
-    }
-
-    /// Helper for insertion of an iterator
-    /// @note: This uses std::distance to support GCC 4.8,
-    ///        see https://github.com/nlohmann/json/pull/1257
-    template<typename... Args>
-    iterator insert_iterator(const_iterator pos, Args&& ... args)
-    {
-        iterator result(this);
-        JSON_ASSERT(m_value.array != nullptr);
-
-        auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
-        m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
-        result.m_it.array_iterator = m_value.array->begin() + insert_pos;
-
-        // This could have been written as:
-        // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val);
-        // but the return value of insert is missing in GCC 4.8, so it is written this way instead.
-
-        return result;
-    }
-
-    /*!
-    @brief inserts element
-
-    Inserts element @a val before iterator @a pos.
-
-    @param[in] pos iterator before which the content will be inserted; may be
-    the end() iterator
-    @param[in] val element to insert
-    @return iterator pointing to the inserted @a val.
-
-    @throw type_error.309 if called on JSON values other than arrays;
-    example: `"cannot use insert() with string"`
-    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
-    example: `"iterator does not fit current value"`
-
-    @complexity Constant plus linear in the distance between @a pos and end of
-    the container.
-
-    @liveexample{The example shows how `insert()` is used.,insert}
-
-    @since version 1.0.0
-    */
-    iterator insert(const_iterator pos, const basic_json& val)
-    {
-        // insert only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            // check if iterator pos fits to this JSON value
-            if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
-            {
-                JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
-            }
-
-            // insert to array and return iterator
-            return insert_iterator(pos, val);
-        }
-
-        JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief inserts element
-    @copydoc insert(const_iterator, const basic_json&)
-    */
-    iterator insert(const_iterator pos, basic_json&& val)
-    {
-        return insert(pos, val);
-    }
-
-    /*!
-    @brief inserts elements
-
-    Inserts @a cnt copies of @a val before iterator @a pos.
-
-    @param[in] pos iterator before which the content will be inserted; may be
-    the end() iterator
-    @param[in] cnt number of copies of @a val to insert
-    @param[in] val element to insert
-    @return iterator pointing to the first element inserted, or @a pos if
-    `cnt==0`
-
-    @throw type_error.309 if called on JSON values other than arrays; example:
-    `"cannot use insert() with string"`
-    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
-    example: `"iterator does not fit current value"`
-
-    @complexity Linear in @a cnt plus linear in the distance between @a pos
-    and end of the container.
-
-    @liveexample{The example shows how `insert()` is used.,insert__count}
-
-    @since version 1.0.0
-    */
-    iterator insert(const_iterator pos, size_type cnt, const basic_json& val)
-    {
-        // insert only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            // check if iterator pos fits to this JSON value
-            if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
-            {
-                JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
-            }
-
-            // insert to array and return iterator
-            return insert_iterator(pos, cnt, val);
-        }
-
-        JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
-    }
-
-    /*!
-    @brief inserts elements
-
-    Inserts elements from range `[first, last)` before iterator @a pos.
-
-    @param[in] pos iterator before which the content will be inserted; may be
-    the end() iterator
-    @param[in] first begin of the range of elements to insert
-    @param[in] last end of the range of elements to insert
-
-    @throw type_error.309 if called on JSON values other than arrays; example:
-    `"cannot use insert() with string"`
-    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
-    example: `"iterator does not fit current value"`
-    @throw invalid_iterator.210 if @a first and @a last do not belong to the
-    same JSON value; example: `"iterators do not fit"`
-    @throw invalid_iterator.211 if @a first or @a last are iterators into
-    container for which insert is called; example: `"passed iterators may not
-    belong to container"`
-
-    @return iterator pointing to the first element inserted, or @a pos if
-    `first==last`
-
-    @complexity Linear in `std::distance(first, last)` plus linear in the
-    distance between @a pos and end of the container.
-
-    @liveexample{The example shows how `insert()` is used.,insert__range}
-
-    @since version 1.0.0
-    */
-    iterator insert(const_iterator pos, const_iterator first, const_iterator last)
-    {
-        // insert only works for arrays
-        if (JSON_HEDLEY_UNLIKELY(!is_array()))
-        {
-            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
-        }
-
-        // check if iterator pos fits to this JSON value
-        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
-        {
-            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
-        }
-
-        // check if range iterators belong to the same JSON object
-        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
-        }
-
-        if (JSON_HEDLEY_UNLIKELY(first.m_object == this))
-        {
-            JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container"));
-        }
-
-        // insert to array and return iterator
-        return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator);
-    }
-
-    /*!
-    @brief inserts elements
-
-    Inserts elements from initializer list @a ilist before iterator @a pos.
-
-    @param[in] pos iterator before which the content will be inserted; may be
-    the end() iterator
-    @param[in] ilist initializer list to insert the values from
-
-    @throw type_error.309 if called on JSON values other than arrays; example:
-    `"cannot use insert() with string"`
-    @throw invalid_iterator.202 if @a pos is not an iterator of *this;
-    example: `"iterator does not fit current value"`
-
-    @return iterator pointing to the first element inserted, or @a pos if
-    `ilist` is empty
-
-    @complexity Linear in `ilist.size()` plus linear in the distance between
-    @a pos and end of the container.
-
-    @liveexample{The example shows how `insert()` is used.,insert__ilist}
-
-    @since version 1.0.0
-    */
-    iterator insert(const_iterator pos, initializer_list_t ilist)
-    {
-        // insert only works for arrays
-        if (JSON_HEDLEY_UNLIKELY(!is_array()))
-        {
-            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
-        }
-
-        // check if iterator pos fits to this JSON value
-        if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
-        {
-            JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value"));
-        }
-
-        // insert to array and return iterator
-        return insert_iterator(pos, ilist.begin(), ilist.end());
-    }
-
-    /*!
-    @brief inserts elements
-
-    Inserts elements from range `[first, last)`.
-
-    @param[in] first begin of the range of elements to insert
-    @param[in] last end of the range of elements to insert
-
-    @throw type_error.309 if called on JSON values other than objects; example:
-    `"cannot use insert() with string"`
-    @throw invalid_iterator.202 if iterator @a first or @a last does does not
-    point to an object; example: `"iterators first and last must point to
-    objects"`
-    @throw invalid_iterator.210 if @a first and @a last do not belong to the
-    same JSON value; example: `"iterators do not fit"`
-
-    @complexity Logarithmic: `O(N*log(size() + N))`, where `N` is the number
-    of elements to insert.
-
-    @liveexample{The example shows how `insert()` is used.,insert__range_object}
-
-    @since version 3.0.0
-    */
-    void insert(const_iterator first, const_iterator last)
-    {
-        // insert only works for objects
-        if (JSON_HEDLEY_UNLIKELY(!is_object()))
-        {
-            JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name())));
-        }
-
-        // check if range iterators belong to the same JSON object
-        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
-        }
-
-        // passed iterators must belong to objects
-        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
-        {
-            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
-        }
-
-        m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
-    }
-
-    /*!
-    @brief updates a JSON object from another object, overwriting existing keys
-
-    Inserts all values from JSON object @a j and overwrites existing keys.
-
-    @param[in] j  JSON object to read values from
-
-    @throw type_error.312 if called on JSON values other than objects; example:
-    `"cannot use update() with string"`
-
-    @complexity O(N*log(size() + N)), where N is the number of elements to
-                insert.
-
-    @liveexample{The example shows how `update()` is used.,update}
-
-    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
-
-    @since version 3.0.0
-    */
-    void update(const_reference j)
-    {
-        // implicitly convert null value to an empty object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
-            assert_invariant();
-        }
-
-        if (JSON_HEDLEY_UNLIKELY(!is_object()))
-        {
-            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name())));
-        }
-        if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
-        {
-            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name())));
-        }
-
-        for (auto it = j.cbegin(); it != j.cend(); ++it)
-        {
-            m_value.object->operator[](it.key()) = it.value();
-        }
-    }
-
-    /*!
-    @brief updates a JSON object from another object, overwriting existing keys
-
-    Inserts all values from from range `[first, last)` and overwrites existing
-    keys.
-
-    @param[in] first begin of the range of elements to insert
-    @param[in] last end of the range of elements to insert
-
-    @throw type_error.312 if called on JSON values other than objects; example:
-    `"cannot use update() with string"`
-    @throw invalid_iterator.202 if iterator @a first or @a last does does not
-    point to an object; example: `"iterators first and last must point to
-    objects"`
-    @throw invalid_iterator.210 if @a first and @a last do not belong to the
-    same JSON value; example: `"iterators do not fit"`
-
-    @complexity O(N*log(size() + N)), where N is the number of elements to
-                insert.
-
-    @liveexample{The example shows how `update()` is used__range.,update}
-
-    @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
-
-    @since version 3.0.0
-    */
-    void update(const_iterator first, const_iterator last)
-    {
-        // implicitly convert null value to an empty object
-        if (is_null())
-        {
-            m_type = value_t::object;
-            m_value.object = create<object_t>();
-            assert_invariant();
-        }
-
-        if (JSON_HEDLEY_UNLIKELY(!is_object()))
-        {
-            JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name())));
-        }
-
-        // check if range iterators belong to the same JSON object
-        if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
-        {
-            JSON_THROW(invalid_iterator::create(210, "iterators do not fit"));
-        }
-
-        // passed iterators must belong to objects
-        if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()
-                                 || !last.m_object->is_object()))
-        {
-            JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects"));
-        }
-
-        for (auto it = first; it != last; ++it)
-        {
-            m_value.object->operator[](it.key()) = it.value();
-        }
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of the JSON value with those of @a other. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated.
-
-    @param[in,out] other JSON value to exchange the contents with
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how JSON values can be swapped with
-    `swap()`.,swap__reference}
-
-    @since version 1.0.0
-    */
-    void swap(reference other) noexcept (
-        std::is_nothrow_move_constructible<value_t>::value&&
-        std::is_nothrow_move_assignable<value_t>::value&&
-        std::is_nothrow_move_constructible<json_value>::value&&
-        std::is_nothrow_move_assignable<json_value>::value
-    )
-    {
-        std::swap(m_type, other.m_type);
-        std::swap(m_value, other.m_value);
-        assert_invariant();
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of the JSON value from @a left with those of @a right. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated. implemented as a friend function callable via ADL.
-
-    @param[in,out] left JSON value to exchange the contents with
-    @param[in,out] right JSON value to exchange the contents with
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how JSON values can be swapped with
-    `swap()`.,swap__reference}
-
-    @since version 1.0.0
-    */
-    friend void swap(reference left, reference right) noexcept (
-        std::is_nothrow_move_constructible<value_t>::value&&
-        std::is_nothrow_move_assignable<value_t>::value&&
-        std::is_nothrow_move_constructible<json_value>::value&&
-        std::is_nothrow_move_assignable<json_value>::value
-    )
-    {
-        left.swap(right);
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of a JSON array with those of @a other. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated.
-
-    @param[in,out] other array to exchange the contents with
-
-    @throw type_error.310 when JSON value is not an array; example: `"cannot
-    use swap() with string"`
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how arrays can be swapped with
-    `swap()`.,swap__array_t}
-
-    @since version 1.0.0
-    */
-    void swap(array_t& other)
-    {
-        // swap only works for arrays
-        if (JSON_HEDLEY_LIKELY(is_array()))
-        {
-            std::swap(*(m_value.array), other);
-        }
-        else
-        {
-            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of a JSON object with those of @a other. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated.
-
-    @param[in,out] other object to exchange the contents with
-
-    @throw type_error.310 when JSON value is not an object; example:
-    `"cannot use swap() with string"`
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how objects can be swapped with
-    `swap()`.,swap__object_t}
-
-    @since version 1.0.0
-    */
-    void swap(object_t& other)
-    {
-        // swap only works for objects
-        if (JSON_HEDLEY_LIKELY(is_object()))
-        {
-            std::swap(*(m_value.object), other);
-        }
-        else
-        {
-            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of a JSON string with those of @a other. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated.
-
-    @param[in,out] other string to exchange the contents with
-
-    @throw type_error.310 when JSON value is not a string; example: `"cannot
-    use swap() with boolean"`
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how strings can be swapped with
-    `swap()`.,swap__string_t}
-
-    @since version 1.0.0
-    */
-    void swap(string_t& other)
-    {
-        // swap only works for strings
-        if (JSON_HEDLEY_LIKELY(is_string()))
-        {
-            std::swap(*(m_value.string), other);
-        }
-        else
-        {
-            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
-        }
-    }
-
-    /*!
-    @brief exchanges the values
-
-    Exchanges the contents of a JSON string with those of @a other. Does not
-    invoke any move, copy, or swap operations on individual elements. All
-    iterators and references remain valid. The past-the-end iterator is
-    invalidated.
-
-    @param[in,out] other binary to exchange the contents with
-
-    @throw type_error.310 when JSON value is not a string; example: `"cannot
-    use swap() with boolean"`
-
-    @complexity Constant.
-
-    @liveexample{The example below shows how strings can be swapped with
-    `swap()`.,swap__binary_t}
-
-    @since version 3.8.0
-    */
-    void swap(binary_t& other)
-    {
-        // swap only works for strings
-        if (JSON_HEDLEY_LIKELY(is_binary()))
-        {
-            std::swap(*(m_value.binary), other);
-        }
-        else
-        {
-            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
-        }
-    }
-
-    /// @copydoc swap(binary_t)
-    void swap(typename binary_t::container_type& other)
-    {
-        // swap only works for strings
-        if (JSON_HEDLEY_LIKELY(is_binary()))
-        {
-            std::swap(*(m_value.binary), other);
-        }
-        else
-        {
-            JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name())));
-        }
-    }
-
-    /// @}
-
-  public:
-    //////////////////////////////////////////
-    // lexicographical comparison operators //
-    //////////////////////////////////////////
-
-    /// @name lexicographical comparison operators
-    /// @{
-
-    /*!
-    @brief comparison: equal
-
-    Compares two JSON values for equality according to the following rules:
-    - Two JSON values are equal if (1) they are from the same type and (2)
-      their stored values are the same according to their respective
-      `operator==`.
-    - Integer and floating-point numbers are automatically converted before
-      comparison. Note that two NaN values are always treated as unequal.
-    - Two JSON null values are equal.
-
-    @note Floating-point inside JSON values numbers are compared with
-    `json::number_float_t::operator==` which is `double::operator==` by
-    default. To compare floating-point while respecting an epsilon, an alternative
-    [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39)
-    could be used, for instance
-    @code {.cpp}
-    template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
-    inline bool is_same(T a, T b, T epsilon = std::numeric_limits<T>::epsilon()) noexcept
-    {
-        return std::abs(a - b) <= epsilon;
-    }
-    @endcode
-    Or you can self-defined operator equal function like this:
-    @code {.cpp}
-    bool my_equal(const_reference lhs, const_reference rhs) {
-    const auto lhs_type lhs.type();
-    const auto rhs_type rhs.type();
-    if (lhs_type == rhs_type) {
-        switch(lhs_type)
-            // self_defined case
-            case value_t::number_float:
-                return std::abs(lhs - rhs) <= std::numeric_limits<float>::epsilon();
-            // other cases remain the same with the original
-            ...
-    }
-    ...
-    }
-    @endcode
-
-    @note NaN values never compare equal to themselves or to other NaN values.
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether the values @a lhs and @a rhs are equal
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @complexity Linear.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__equal}
-
-    @since version 1.0.0
-    */
-    friend bool operator==(const_reference lhs, const_reference rhs) noexcept
-    {
-        const auto lhs_type = lhs.type();
-        const auto rhs_type = rhs.type();
-
-        if (lhs_type == rhs_type)
-        {
-            switch (lhs_type)
-            {
-                case value_t::array:
-                    return *lhs.m_value.array == *rhs.m_value.array;
-
-                case value_t::object:
-                    return *lhs.m_value.object == *rhs.m_value.object;
-
-                case value_t::null:
-                    return true;
-
-                case value_t::string:
-                    return *lhs.m_value.string == *rhs.m_value.string;
-
-                case value_t::boolean:
-                    return lhs.m_value.boolean == rhs.m_value.boolean;
-
-                case value_t::number_integer:
-                    return lhs.m_value.number_integer == rhs.m_value.number_integer;
-
-                case value_t::number_unsigned:
-                    return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned;
-
-                case value_t::number_float:
-                    return lhs.m_value.number_float == rhs.m_value.number_float;
-
-                case value_t::binary:
-                    return *lhs.m_value.binary == *rhs.m_value.binary;
-
-                default:
-                    return false;
-            }
-        }
-        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
-        {
-            return static_cast<number_float_t>(lhs.m_value.number_integer) == rhs.m_value.number_float;
-        }
-        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
-        {
-            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_integer);
-        }
-        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
-        {
-            return static_cast<number_float_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_float;
-        }
-        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
-        {
-            return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_unsigned);
-        }
-        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
-        {
-            return static_cast<number_integer_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_integer;
-        }
-        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
-        {
-            return lhs.m_value.number_integer == static_cast<number_integer_t>(rhs.m_value.number_unsigned);
-        }
-
-        return false;
-    }
-
-    /*!
-    @brief comparison: equal
-    @copydoc operator==(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator==(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs == basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: equal
-    @copydoc operator==(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator==(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) == rhs;
-    }
-
-    /*!
-    @brief comparison: not equal
-
-    Compares two JSON values for inequality by calculating `not (lhs == rhs)`.
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether the values @a lhs and @a rhs are not equal
-
-    @complexity Linear.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__notequal}
-
-    @since version 1.0.0
-    */
-    friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
-    {
-        return !(lhs == rhs);
-    }
-
-    /*!
-    @brief comparison: not equal
-    @copydoc operator!=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator!=(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs != basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: not equal
-    @copydoc operator!=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator!=(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) != rhs;
-    }
-
-    /*!
-    @brief comparison: less than
-
-    Compares whether one JSON value @a lhs is less than another JSON value @a
-    rhs according to the following rules:
-    - If @a lhs and @a rhs have the same type, the values are compared using
-      the default `<` operator.
-    - Integer and floating-point numbers are automatically converted before
-      comparison
-    - In case @a lhs and @a rhs have different types, the values are ignored
-      and the order of the types is considered, see
-      @ref operator<(const value_t, const value_t).
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether @a lhs is less than @a rhs
-
-    @complexity Linear.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__less}
-
-    @since version 1.0.0
-    */
-    friend bool operator<(const_reference lhs, const_reference rhs) noexcept
-    {
-        const auto lhs_type = lhs.type();
-        const auto rhs_type = rhs.type();
-
-        if (lhs_type == rhs_type)
-        {
-            switch (lhs_type)
-            {
-                case value_t::array:
-                    // note parentheses are necessary, see
-                    // https://github.com/nlohmann/json/issues/1530
-                    return (*lhs.m_value.array) < (*rhs.m_value.array);
-
-                case value_t::object:
-                    return (*lhs.m_value.object) < (*rhs.m_value.object);
-
-                case value_t::null:
-                    return false;
-
-                case value_t::string:
-                    return (*lhs.m_value.string) < (*rhs.m_value.string);
-
-                case value_t::boolean:
-                    return (lhs.m_value.boolean) < (rhs.m_value.boolean);
-
-                case value_t::number_integer:
-                    return (lhs.m_value.number_integer) < (rhs.m_value.number_integer);
-
-                case value_t::number_unsigned:
-                    return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned);
-
-                case value_t::number_float:
-                    return (lhs.m_value.number_float) < (rhs.m_value.number_float);
-
-                case value_t::binary:
-                    return (*lhs.m_value.binary) < (*rhs.m_value.binary);
-
-                default:
-                    return false;
-            }
-        }
-        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
-        {
-            return static_cast<number_float_t>(lhs.m_value.number_integer) < rhs.m_value.number_float;
-        }
-        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
-        {
-            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_integer);
-        }
-        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
-        {
-            return static_cast<number_float_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_float;
-        }
-        else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
-        {
-            return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_unsigned);
-        }
-        else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
-        {
-            return lhs.m_value.number_integer < static_cast<number_integer_t>(rhs.m_value.number_unsigned);
-        }
-        else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
-        {
-            return static_cast<number_integer_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_integer;
-        }
-
-        // We only reach this line if we cannot compare values. In that case,
-        // we compare types. Note we have to call the operator explicitly,
-        // because MSVC has problems otherwise.
-        return operator<(lhs_type, rhs_type);
-    }
-
-    /*!
-    @brief comparison: less than
-    @copydoc operator<(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator<(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs < basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: less than
-    @copydoc operator<(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator<(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) < rhs;
-    }
-
-    /*!
-    @brief comparison: less than or equal
-
-    Compares whether one JSON value @a lhs is less than or equal to another
-    JSON value by calculating `not (rhs < lhs)`.
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether @a lhs is less than or equal to @a rhs
-
-    @complexity Linear.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__greater}
-
-    @since version 1.0.0
-    */
-    friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
-    {
-        return !(rhs < lhs);
-    }
-
-    /*!
-    @brief comparison: less than or equal
-    @copydoc operator<=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator<=(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs <= basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: less than or equal
-    @copydoc operator<=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator<=(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) <= rhs;
-    }
-
-    /*!
-    @brief comparison: greater than
-
-    Compares whether one JSON value @a lhs is greater than another
-    JSON value by calculating `not (lhs <= rhs)`.
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether @a lhs is greater than to @a rhs
-
-    @complexity Linear.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__lessequal}
-
-    @since version 1.0.0
-    */
-    friend bool operator>(const_reference lhs, const_reference rhs) noexcept
-    {
-        return !(lhs <= rhs);
-    }
-
-    /*!
-    @brief comparison: greater than
-    @copydoc operator>(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator>(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs > basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: greater than
-    @copydoc operator>(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator>(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) > rhs;
-    }
-
-    /*!
-    @brief comparison: greater than or equal
-
-    Compares whether one JSON value @a lhs is greater than or equal to another
-    JSON value by calculating `not (lhs < rhs)`.
-
-    @param[in] lhs  first JSON value to consider
-    @param[in] rhs  second JSON value to consider
-    @return whether @a lhs is greater than or equal to @a rhs
-
-    @complexity Linear.
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @liveexample{The example demonstrates comparing several JSON
-    types.,operator__greaterequal}
-
-    @since version 1.0.0
-    */
-    friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
-    {
-        return !(lhs < rhs);
-    }
-
-    /*!
-    @brief comparison: greater than or equal
-    @copydoc operator>=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator>=(const_reference lhs, const ScalarType rhs) noexcept
-    {
-        return lhs >= basic_json(rhs);
-    }
-
-    /*!
-    @brief comparison: greater than or equal
-    @copydoc operator>=(const_reference, const_reference)
-    */
-    template<typename ScalarType, typename std::enable_if<
-                 std::is_scalar<ScalarType>::value, int>::type = 0>
-    friend bool operator>=(const ScalarType lhs, const_reference rhs) noexcept
-    {
-        return basic_json(lhs) >= rhs;
-    }
-
-    /// @}
-
-    ///////////////////
-    // serialization //
-    ///////////////////
-
-    /// @name serialization
-    /// @{
-
-    /*!
-    @brief serialize to stream
-
-    Serialize the given JSON value @a j to the output stream @a o. The JSON
-    value will be serialized using the @ref dump member function.
-
-    - The indentation of the output can be controlled with the member variable
-      `width` of the output stream @a o. For instance, using the manipulator
-      `std::setw(4)` on @a o sets the indentation level to `4` and the
-      serialization result is the same as calling `dump(4)`.
-
-    - The indentation character can be controlled with the member variable
-      `fill` of the output stream @a o. For instance, the manipulator
-      `std::setfill('\\t')` sets indentation to use a tab character rather than
-      the default space character.
-
-    @param[in,out] o  stream to serialize to
-    @param[in] j  JSON value to serialize
-
-    @return the stream @a o
-
-    @throw type_error.316 if a string stored inside the JSON value is not
-                          UTF-8 encoded
-
-    @complexity Linear.
-
-    @liveexample{The example below shows the serialization with different
-    parameters to `width` to adjust the indentation level.,operator_serialize}
-
-    @since version 1.0.0; indentation character added in version 3.0.0
-    */
-    friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
-    {
-        // read width member and use it as indentation parameter if nonzero
-        const bool pretty_print = o.width() > 0;
-        const auto indentation = pretty_print ? o.width() : 0;
-
-        // reset width to 0 for subsequent calls to this stream
-        o.width(0);
-
-        // do the actual serialization
-        serializer s(detail::output_adapter<char>(o), o.fill());
-        s.dump(j, pretty_print, false, static_cast<unsigned int>(indentation));
-        return o;
-    }
-
-    /*!
-    @brief serialize to stream
-    @deprecated This stream operator is deprecated and will be removed in
-                future 4.0.0 of the library. Please use
-                @ref operator<<(std::ostream&, const basic_json&)
-                instead; that is, replace calls like `j >> o;` with `o << j;`.
-    @since version 1.0.0; deprecated since version 3.0.0
-    */
-    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&))
-    friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
-    {
-        return o << j;
-    }
-
-    /// @}
-
-
-    /////////////////////
-    // deserialization //
-    /////////////////////
-
-    /// @name deserialization
-    /// @{
-
-    /*!
-    @brief deserialize from a compatible input
-
-    @tparam InputType A compatible input, for instance
-    - an std::istream object
-    - a FILE pointer
-    - a C-style array of characters
-    - a pointer to a null-terminated string of single byte characters
-    - an object obj for which begin(obj) and end(obj) produces a valid pair of
-      iterators.
-
-    @param[in] i  input to read from
-    @param[in] cb  a parser callback function of type @ref parser_callback_t
-    which is used to control the deserialization by filtering unwanted values
-    (optional)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-    @param[in] ignore_comments  whether comments should be ignored and treated
-    like whitespace (true) or yield a parse error (true); (optional, false by
-    default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
-    of input; expected string literal""`
-    @throw parse_error.102 if to_unicode fails or surrogate error
-    @throw parse_error.103 if to_unicode fails
-
-    @complexity Linear in the length of the input. The parser is a predictive
-    LL(1) parser. The complexity can be higher if the parser callback function
-    @a cb or reading from the input @a i has a super-linear complexity.
-
-    @note A UTF-8 byte order mark is silently ignored.
-
-    @liveexample{The example below demonstrates the `parse()` function reading
-    from an array.,parse__array__parser_callback_t}
-
-    @liveexample{The example below demonstrates the `parse()` function with
-    and without callback function.,parse__string__parser_callback_t}
-
-    @liveexample{The example below demonstrates the `parse()` function with
-    and without callback function.,parse__istream__parser_callback_t}
-
-    @liveexample{The example below demonstrates the `parse()` function reading
-    from a contiguous container.,parse__contiguouscontainer__parser_callback_t}
-
-    @since version 2.0.3 (contiguous containers); version 3.9.0 allowed to
-    ignore comments.
-    */
-    template<typename InputType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json parse(InputType&& i,
-                            const parser_callback_t cb = nullptr,
-                            const bool allow_exceptions = true,
-                            const bool ignore_comments = false)
-    {
-        basic_json result;
-        parser(detail::input_adapter(std::forward<InputType>(i)), cb, allow_exceptions, ignore_comments).parse(true, result);
-        return result;
-    }
-
-    /*!
-    @brief deserialize from a pair of character iterators
-
-    The value_type of the iterator must be a integral type with size of 1, 2 or
-    4 bytes, which will be interpreted respectively as UTF-8, UTF-16 and UTF-32.
-
-    @param[in] first iterator to start of character range
-    @param[in] last  iterator to end of character range
-    @param[in] cb  a parser callback function of type @ref parser_callback_t
-    which is used to control the deserialization by filtering unwanted values
-    (optional)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-    @param[in] ignore_comments  whether comments should be ignored and treated
-    like whitespace (true) or yield a parse error (true); (optional, false by
-    default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
-    of input; expected string literal""`
-    @throw parse_error.102 if to_unicode fails or surrogate error
-    @throw parse_error.103 if to_unicode fails
-    */
-    template<typename IteratorType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json parse(IteratorType first,
-                            IteratorType last,
-                            const parser_callback_t cb = nullptr,
-                            const bool allow_exceptions = true,
-                            const bool ignore_comments = false)
-    {
-        basic_json result;
-        parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result);
-        return result;
-    }
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len))
-    static basic_json parse(detail::span_input_adapter&& i,
-                            const parser_callback_t cb = nullptr,
-                            const bool allow_exceptions = true,
-                            const bool ignore_comments = false)
-    {
-        basic_json result;
-        parser(i.get(), cb, allow_exceptions, ignore_comments).parse(true, result);
-        return result;
-    }
-
-    /*!
-    @brief check if the input is valid JSON
-
-    Unlike the @ref parse(InputType&&, const parser_callback_t,const bool)
-    function, this function neither throws an exception in case of invalid JSON
-    input (i.e., a parse error) nor creates diagnostic information.
-
-    @tparam InputType A compatible input, for instance
-    - an std::istream object
-    - a FILE pointer
-    - a C-style array of characters
-    - a pointer to a null-terminated string of single byte characters
-    - an object obj for which begin(obj) and end(obj) produces a valid pair of
-      iterators.
-
-    @param[in] i input to read from
-    @param[in] ignore_comments  whether comments should be ignored and treated
-    like whitespace (true) or yield a parse error (true); (optional, false by
-    default)
-
-    @return Whether the input read from @a i is valid JSON.
-
-    @complexity Linear in the length of the input. The parser is a predictive
-    LL(1) parser.
-
-    @note A UTF-8 byte order mark is silently ignored.
-
-    @liveexample{The example below demonstrates the `accept()` function reading
-    from a string.,accept__string}
-    */
-    template<typename InputType>
-    static bool accept(InputType&& i,
-                       const bool ignore_comments = false)
-    {
-        return parser(detail::input_adapter(std::forward<InputType>(i)), nullptr, false, ignore_comments).accept(true);
-    }
-
-    template<typename IteratorType>
-    static bool accept(IteratorType first, IteratorType last,
-                       const bool ignore_comments = false)
-    {
-        return parser(detail::input_adapter(std::move(first), std::move(last)), nullptr, false, ignore_comments).accept(true);
-    }
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len))
-    static bool accept(detail::span_input_adapter&& i,
-                       const bool ignore_comments = false)
-    {
-        return parser(i.get(), nullptr, false, ignore_comments).accept(true);
-    }
-
-    /*!
-    @brief generate SAX events
-
-    The SAX event lister must follow the interface of @ref json_sax.
-
-    This function reads from a compatible input. Examples are:
-    - an std::istream object
-    - a FILE pointer
-    - a C-style array of characters
-    - a pointer to a null-terminated string of single byte characters
-    - an object obj for which begin(obj) and end(obj) produces a valid pair of
-      iterators.
-
-    @param[in] i  input to read from
-    @param[in,out] sax  SAX event listener
-    @param[in] format  the format to parse (JSON, CBOR, MessagePack, or UBJSON)
-    @param[in] strict  whether the input has to be consumed completely
-    @param[in] ignore_comments  whether comments should be ignored and treated
-    like whitespace (true) or yield a parse error (true); (optional, false by
-    default); only applies to the JSON file format.
-
-    @return return value of the last processed SAX event
-
-    @throw parse_error.101 if a parse error occurs; example: `""unexpected end
-    of input; expected string literal""`
-    @throw parse_error.102 if to_unicode fails or surrogate error
-    @throw parse_error.103 if to_unicode fails
-
-    @complexity Linear in the length of the input. The parser is a predictive
-    LL(1) parser. The complexity can be higher if the SAX consumer @a sax has
-    a super-linear complexity.
-
-    @note A UTF-8 byte order mark is silently ignored.
-
-    @liveexample{The example below demonstrates the `sax_parse()` function
-    reading from string and processing the events with a user-defined SAX
-    event consumer.,sax_parse}
-
-    @since version 3.2.0
-    */
-    template <typename InputType, typename SAX>
-    JSON_HEDLEY_NON_NULL(2)
-    static bool sax_parse(InputType&& i, SAX* sax,
-                          input_format_t format = input_format_t::json,
-                          const bool strict = true,
-                          const bool ignore_comments = false)
-    {
-        auto ia = detail::input_adapter(std::forward<InputType>(i));
-        return format == input_format_t::json
-               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
-               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
-    }
-
-    template<class IteratorType, class SAX>
-    JSON_HEDLEY_NON_NULL(3)
-    static bool sax_parse(IteratorType first, IteratorType last, SAX* sax,
-                          input_format_t format = input_format_t::json,
-                          const bool strict = true,
-                          const bool ignore_comments = false)
-    {
-        auto ia = detail::input_adapter(std::move(first), std::move(last));
-        return format == input_format_t::json
-               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
-               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
-    }
-
-    template <typename SAX>
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...))
-    JSON_HEDLEY_NON_NULL(2)
-    static bool sax_parse(detail::span_input_adapter&& i, SAX* sax,
-                          input_format_t format = input_format_t::json,
-                          const bool strict = true,
-                          const bool ignore_comments = false)
-    {
-        auto ia = i.get();
-        return format == input_format_t::json
-               ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
-               : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
-    }
-
-    /*!
-    @brief deserialize from stream
-    @deprecated This stream operator is deprecated and will be removed in
-                version 4.0.0 of the library. Please use
-                @ref operator>>(std::istream&, basic_json&)
-                instead; that is, replace calls like `j << i;` with `i >> j;`.
-    @since version 1.0.0; deprecated since version 3.0.0
-    */
-    JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&))
-    friend std::istream& operator<<(basic_json& j, std::istream& i)
-    {
-        return operator>>(i, j);
-    }
-
-    /*!
-    @brief deserialize from stream
-
-    Deserializes an input stream to a JSON value.
-
-    @param[in,out] i  input stream to read a serialized JSON value from
-    @param[in,out] j  JSON value to write the deserialized input to
-
-    @throw parse_error.101 in case of an unexpected token
-    @throw parse_error.102 if to_unicode fails or surrogate error
-    @throw parse_error.103 if to_unicode fails
-
-    @complexity Linear in the length of the input. The parser is a predictive
-    LL(1) parser.
-
-    @note A UTF-8 byte order mark is silently ignored.
-
-    @liveexample{The example below shows how a JSON value is constructed by
-    reading a serialization from a stream.,operator_deserialize}
-
-    @sa parse(std::istream&, const parser_callback_t) for a variant with a
-    parser callback function to filter values while parsing
-
-    @since version 1.0.0
-    */
-    friend std::istream& operator>>(std::istream& i, basic_json& j)
-    {
-        parser(detail::input_adapter(i)).parse(false, j);
-        return i;
-    }
-
-    /// @}
-
-    ///////////////////////////
-    // convenience functions //
-    ///////////////////////////
-
-    /*!
-    @brief return the type as string
-
-    Returns the type name as string to be used in error messages - usually to
-    indicate that a function was called on a wrong JSON type.
-
-    @return a string representation of a the @a m_type member:
-            Value type  | return value
-            ----------- | -------------
-            null        | `"null"`
-            boolean     | `"boolean"`
-            string      | `"string"`
-            number      | `"number"` (for all number types)
-            object      | `"object"`
-            array       | `"array"`
-            binary      | `"binary"`
-            discarded   | `"discarded"`
-
-    @exceptionsafety No-throw guarantee: this function never throws exceptions.
-
-    @complexity Constant.
-
-    @liveexample{The following code exemplifies `type_name()` for all JSON
-    types.,type_name}
-
-    @sa @ref type() -- return the type of the JSON value
-    @sa @ref operator value_t() -- return the type of the JSON value (implicit)
-
-    @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept`
-    since 3.0.0
-    */
-    JSON_HEDLEY_RETURNS_NON_NULL
-    const char* type_name() const noexcept
-    {
-        {
-            switch (m_type)
-            {
-                case value_t::null:
-                    return "null";
-                case value_t::object:
-                    return "object";
-                case value_t::array:
-                    return "array";
-                case value_t::string:
-                    return "string";
-                case value_t::boolean:
-                    return "boolean";
-                case value_t::binary:
-                    return "binary";
-                case value_t::discarded:
-                    return "discarded";
-                default:
-                    return "number";
-            }
-        }
-    }
-
-
-  private:
-    //////////////////////
-    // member variables //
-    //////////////////////
-
-    /// the type of the current element
-    value_t m_type = value_t::null;
-
-    /// the value of the current element
-    json_value m_value = {};
-
-    //////////////////////////////////////////
-    // binary serialization/deserialization //
-    //////////////////////////////////////////
-
-    /// @name binary serialization/deserialization support
-    /// @{
-
-  public:
-    /*!
-    @brief create a CBOR serialization of a given JSON value
-
-    Serializes a given JSON value @a j to a byte vector using the CBOR (Concise
-    Binary Object Representation) serialization format. CBOR is a binary
-    serialization format which aims to be more compact than JSON itself, yet
-    more efficient to parse.
-
-    The library uses the following mapping from JSON values types to
-    CBOR types according to the CBOR specification (RFC 7049):
-
-    JSON value type | value/range                                | CBOR type                          | first byte
-    --------------- | ------------------------------------------ | ---------------------------------- | ---------------
-    null            | `null`                                     | Null                               | 0xF6
-    boolean         | `true`                                     | True                               | 0xF5
-    boolean         | `false`                                    | False                              | 0xF4
-    number_integer  | -9223372036854775808..-2147483649          | Negative integer (8 bytes follow)  | 0x3B
-    number_integer  | -2147483648..-32769                        | Negative integer (4 bytes follow)  | 0x3A
-    number_integer  | -32768..-129                               | Negative integer (2 bytes follow)  | 0x39
-    number_integer  | -128..-25                                  | Negative integer (1 byte follow)   | 0x38
-    number_integer  | -24..-1                                    | Negative integer                   | 0x20..0x37
-    number_integer  | 0..23                                      | Integer                            | 0x00..0x17
-    number_integer  | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
-    number_integer  | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
-    number_integer  | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
-    number_integer  | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
-    number_unsigned | 0..23                                      | Integer                            | 0x00..0x17
-    number_unsigned | 24..255                                    | Unsigned integer (1 byte follow)   | 0x18
-    number_unsigned | 256..65535                                 | Unsigned integer (2 bytes follow)  | 0x19
-    number_unsigned | 65536..4294967295                          | Unsigned integer (4 bytes follow)  | 0x1A
-    number_unsigned | 4294967296..18446744073709551615           | Unsigned integer (8 bytes follow)  | 0x1B
-    number_float    | *any value representable by a float*       | Single-Precision Float             | 0xFA
-    number_float    | *any value NOT representable by a float*   | Double-Precision Float             | 0xFB
-    string          | *length*: 0..23                            | UTF-8 string                       | 0x60..0x77
-    string          | *length*: 23..255                          | UTF-8 string (1 byte follow)       | 0x78
-    string          | *length*: 256..65535                       | UTF-8 string (2 bytes follow)      | 0x79
-    string          | *length*: 65536..4294967295                | UTF-8 string (4 bytes follow)      | 0x7A
-    string          | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow)      | 0x7B
-    array           | *size*: 0..23                              | array                              | 0x80..0x97
-    array           | *size*: 23..255                            | array (1 byte follow)              | 0x98
-    array           | *size*: 256..65535                         | array (2 bytes follow)             | 0x99
-    array           | *size*: 65536..4294967295                  | array (4 bytes follow)             | 0x9A
-    array           | *size*: 4294967296..18446744073709551615   | array (8 bytes follow)             | 0x9B
-    object          | *size*: 0..23                              | map                                | 0xA0..0xB7
-    object          | *size*: 23..255                            | map (1 byte follow)                | 0xB8
-    object          | *size*: 256..65535                         | map (2 bytes follow)               | 0xB9
-    object          | *size*: 65536..4294967295                  | map (4 bytes follow)               | 0xBA
-    object          | *size*: 4294967296..18446744073709551615   | map (8 bytes follow)               | 0xBB
-    binary          | *size*: 0..23                              | byte string                        | 0x40..0x57
-    binary          | *size*: 23..255                            | byte string (1 byte follow)        | 0x58
-    binary          | *size*: 256..65535                         | byte string (2 bytes follow)       | 0x59
-    binary          | *size*: 65536..4294967295                  | byte string (4 bytes follow)       | 0x5A
-    binary          | *size*: 4294967296..18446744073709551615   | byte string (8 bytes follow)       | 0x5B
-
-    @note The mapping is **complete** in the sense that any JSON value type
-          can be converted to a CBOR value.
-
-    @note If NaN or Infinity are stored inside a JSON number, they are
-          serialized properly. This behavior differs from the @ref dump()
-          function which serializes NaN or Infinity to `null`.
-
-    @note The following CBOR types are not used in the conversion:
-          - UTF-8 strings terminated by "break" (0x7F)
-          - arrays terminated by "break" (0x9F)
-          - maps terminated by "break" (0xBF)
-          - byte strings terminated by "break" (0x5F)
-          - date/time (0xC0..0xC1)
-          - bignum (0xC2..0xC3)
-          - decimal fraction (0xC4)
-          - bigfloat (0xC5)
-          - expected conversions (0xD5..0xD7)
-          - simple values (0xE0..0xF3, 0xF8)
-          - undefined (0xF7)
-          - half-precision floats (0xF9)
-          - break (0xFF)
-
-    @param[in] j  JSON value to serialize
-    @return CBOR serialization as byte vector
-
-    @complexity Linear in the size of the JSON value @a j.
-
-    @liveexample{The example shows the serialization of a JSON value to a byte
-    vector in CBOR format.,to_cbor}
-
-    @sa http://cbor.io
-    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
-        analogous deserialization
-    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
-    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
-             related UBJSON format
-
-    @since version 2.0.9; compact representation of floating-point numbers
-           since version 3.8.0
-    */
-    static std::vector<uint8_t> to_cbor(const basic_json& j)
-    {
-        std::vector<uint8_t> result;
-        to_cbor(j, result);
-        return result;
-    }
-
-    static void to_cbor(const basic_json& j, detail::output_adapter<uint8_t> o)
-    {
-        binary_writer<uint8_t>(o).write_cbor(j);
-    }
-
-    static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
-    {
-        binary_writer<char>(o).write_cbor(j);
-    }
-
-    /*!
-    @brief create a MessagePack serialization of a given JSON value
-
-    Serializes a given JSON value @a j to a byte vector using the MessagePack
-    serialization format. MessagePack is a binary serialization format which
-    aims to be more compact than JSON itself, yet more efficient to parse.
-
-    The library uses the following mapping from JSON values types to
-    MessagePack types according to the MessagePack specification:
-
-    JSON value type | value/range                       | MessagePack type | first byte
-    --------------- | --------------------------------- | ---------------- | ----------
-    null            | `null`                            | nil              | 0xC0
-    boolean         | `true`                            | true             | 0xC3
-    boolean         | `false`                           | false            | 0xC2
-    number_integer  | -9223372036854775808..-2147483649 | int64            | 0xD3
-    number_integer  | -2147483648..-32769               | int32            | 0xD2
-    number_integer  | -32768..-129                      | int16            | 0xD1
-    number_integer  | -128..-33                         | int8             | 0xD0
-    number_integer  | -32..-1                           | negative fixint  | 0xE0..0xFF
-    number_integer  | 0..127                            | positive fixint  | 0x00..0x7F
-    number_integer  | 128..255                          | uint 8           | 0xCC
-    number_integer  | 256..65535                        | uint 16          | 0xCD
-    number_integer  | 65536..4294967295                 | uint 32          | 0xCE
-    number_integer  | 4294967296..18446744073709551615  | uint 64          | 0xCF
-    number_unsigned | 0..127                            | positive fixint  | 0x00..0x7F
-    number_unsigned | 128..255                          | uint 8           | 0xCC
-    number_unsigned | 256..65535                        | uint 16          | 0xCD
-    number_unsigned | 65536..4294967295                 | uint 32          | 0xCE
-    number_unsigned | 4294967296..18446744073709551615  | uint 64          | 0xCF
-    number_float    | *any value representable by a float*     | float 32 | 0xCA
-    number_float    | *any value NOT representable by a float* | float 64 | 0xCB
-    string          | *length*: 0..31                   | fixstr           | 0xA0..0xBF
-    string          | *length*: 32..255                 | str 8            | 0xD9
-    string          | *length*: 256..65535              | str 16           | 0xDA
-    string          | *length*: 65536..4294967295       | str 32           | 0xDB
-    array           | *size*: 0..15                     | fixarray         | 0x90..0x9F
-    array           | *size*: 16..65535                 | array 16         | 0xDC
-    array           | *size*: 65536..4294967295         | array 32         | 0xDD
-    object          | *size*: 0..15                     | fix map          | 0x80..0x8F
-    object          | *size*: 16..65535                 | map 16           | 0xDE
-    object          | *size*: 65536..4294967295         | map 32           | 0xDF
-    binary          | *size*: 0..255                    | bin 8            | 0xC4
-    binary          | *size*: 256..65535                | bin 16           | 0xC5
-    binary          | *size*: 65536..4294967295         | bin 32           | 0xC6
-
-    @note The mapping is **complete** in the sense that any JSON value type
-          can be converted to a MessagePack value.
-
-    @note The following values can **not** be converted to a MessagePack value:
-          - strings with more than 4294967295 bytes
-          - byte strings with more than 4294967295 bytes
-          - arrays with more than 4294967295 elements
-          - objects with more than 4294967295 elements
-
-    @note Any MessagePack output created @ref to_msgpack can be successfully
-          parsed by @ref from_msgpack.
-
-    @note If NaN or Infinity are stored inside a JSON number, they are
-          serialized properly. This behavior differs from the @ref dump()
-          function which serializes NaN or Infinity to `null`.
-
-    @param[in] j  JSON value to serialize
-    @return MessagePack serialization as byte vector
-
-    @complexity Linear in the size of the JSON value @a j.
-
-    @liveexample{The example shows the serialization of a JSON value to a byte
-    vector in MessagePack format.,to_msgpack}
-
-    @sa http://msgpack.org
-    @sa @ref from_msgpack for the analogous deserialization
-    @sa @ref to_cbor(const basic_json& for the related CBOR format
-    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
-             related UBJSON format
-
-    @since version 2.0.9
-    */
-    static std::vector<uint8_t> to_msgpack(const basic_json& j)
-    {
-        std::vector<uint8_t> result;
-        to_msgpack(j, result);
-        return result;
-    }
-
-    static void to_msgpack(const basic_json& j, detail::output_adapter<uint8_t> o)
-    {
-        binary_writer<uint8_t>(o).write_msgpack(j);
-    }
-
-    static void to_msgpack(const basic_json& j, detail::output_adapter<char> o)
-    {
-        binary_writer<char>(o).write_msgpack(j);
-    }
-
-    /*!
-    @brief create a UBJSON serialization of a given JSON value
-
-    Serializes a given JSON value @a j to a byte vector using the UBJSON
-    (Universal Binary JSON) serialization format. UBJSON aims to be more compact
-    than JSON itself, yet more efficient to parse.
-
-    The library uses the following mapping from JSON values types to
-    UBJSON types according to the UBJSON specification:
-
-    JSON value type | value/range                       | UBJSON type | marker
-    --------------- | --------------------------------- | ----------- | ------
-    null            | `null`                            | null        | `Z`
-    boolean         | `true`                            | true        | `T`
-    boolean         | `false`                           | false       | `F`
-    number_integer  | -9223372036854775808..-2147483649 | int64       | `L`
-    number_integer  | -2147483648..-32769               | int32       | `l`
-    number_integer  | -32768..-129                      | int16       | `I`
-    number_integer  | -128..127                         | int8        | `i`
-    number_integer  | 128..255                          | uint8       | `U`
-    number_integer  | 256..32767                        | int16       | `I`
-    number_integer  | 32768..2147483647                 | int32       | `l`
-    number_integer  | 2147483648..9223372036854775807   | int64       | `L`
-    number_unsigned | 0..127                            | int8        | `i`
-    number_unsigned | 128..255                          | uint8       | `U`
-    number_unsigned | 256..32767                        | int16       | `I`
-    number_unsigned | 32768..2147483647                 | int32       | `l`
-    number_unsigned | 2147483648..9223372036854775807   | int64       | `L`
-    number_unsigned | 2147483649..18446744073709551615  | high-precision | `H`
-    number_float    | *any value*                       | float64     | `D`
-    string          | *with shortest length indicator*  | string      | `S`
-    array           | *see notes on optimized format*   | array       | `[`
-    object          | *see notes on optimized format*   | map         | `{`
-
-    @note The mapping is **complete** in the sense that any JSON value type
-          can be converted to a UBJSON value.
-
-    @note The following values can **not** be converted to a UBJSON value:
-          - strings with more than 9223372036854775807 bytes (theoretical)
-
-    @note The following markers are not used in the conversion:
-          - `Z`: no-op values are not created.
-          - `C`: single-byte strings are serialized with `S` markers.
-
-    @note Any UBJSON output created @ref to_ubjson can be successfully parsed
-          by @ref from_ubjson.
-
-    @note If NaN or Infinity are stored inside a JSON number, they are
-          serialized properly. This behavior differs from the @ref dump()
-          function which serializes NaN or Infinity to `null`.
-
-    @note The optimized formats for containers are supported: Parameter
-          @a use_size adds size information to the beginning of a container and
-          removes the closing marker. Parameter @a use_type further checks
-          whether all elements of a container have the same type and adds the
-          type marker to the beginning of the container. The @a use_type
-          parameter must only be used together with @a use_size = true. Note
-          that @a use_size = true alone may result in larger representations -
-          the benefit of this parameter is that the receiving side is
-          immediately informed on the number of elements of the container.
-
-    @note If the JSON data contains the binary type, the value stored is a list
-          of integers, as suggested by the UBJSON documentation.  In particular,
-          this means that serialization and the deserialization of a JSON
-          containing binary values into UBJSON and back will result in a
-          different JSON object.
-
-    @param[in] j  JSON value to serialize
-    @param[in] use_size  whether to add size annotations to container types
-    @param[in] use_type  whether to add type annotations to container types
-                         (must be combined with @a use_size = true)
-    @return UBJSON serialization as byte vector
-
-    @complexity Linear in the size of the JSON value @a j.
-
-    @liveexample{The example shows the serialization of a JSON value to a byte
-    vector in UBJSON format.,to_ubjson}
-
-    @sa http://ubjson.org
-    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
-        analogous deserialization
-    @sa @ref to_cbor(const basic_json& for the related CBOR format
-    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
-
-    @since version 3.1.0
-    */
-    static std::vector<uint8_t> to_ubjson(const basic_json& j,
-                                          const bool use_size = false,
-                                          const bool use_type = false)
-    {
-        std::vector<uint8_t> result;
-        to_ubjson(j, result, use_size, use_type);
-        return result;
-    }
-
-    static void to_ubjson(const basic_json& j, detail::output_adapter<uint8_t> o,
-                          const bool use_size = false, const bool use_type = false)
-    {
-        binary_writer<uint8_t>(o).write_ubjson(j, use_size, use_type);
-    }
-
-    static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
-                          const bool use_size = false, const bool use_type = false)
-    {
-        binary_writer<char>(o).write_ubjson(j, use_size, use_type);
-    }
-
-
-    /*!
-    @brief Serializes the given JSON object `j` to BSON and returns a vector
-           containing the corresponding BSON-representation.
-
-    BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are
-    stored as a single entity (a so-called document).
-
-    The library uses the following mapping from JSON values types to BSON types:
-
-    JSON value type | value/range                       | BSON type   | marker
-    --------------- | --------------------------------- | ----------- | ------
-    null            | `null`                            | null        | 0x0A
-    boolean         | `true`, `false`                   | boolean     | 0x08
-    number_integer  | -9223372036854775808..-2147483649 | int64       | 0x12
-    number_integer  | -2147483648..2147483647           | int32       | 0x10
-    number_integer  | 2147483648..9223372036854775807   | int64       | 0x12
-    number_unsigned | 0..2147483647                     | int32       | 0x10
-    number_unsigned | 2147483648..9223372036854775807   | int64       | 0x12
-    number_unsigned | 9223372036854775808..18446744073709551615| --   | --
-    number_float    | *any value*                       | double      | 0x01
-    string          | *any value*                       | string      | 0x02
-    array           | *any value*                       | document    | 0x04
-    object          | *any value*                       | document    | 0x03
-    binary          | *any value*                       | binary      | 0x05
-
-    @warning The mapping is **incomplete**, since only JSON-objects (and things
-    contained therein) can be serialized to BSON.
-    Also, integers larger than 9223372036854775807 cannot be serialized to BSON,
-    and the keys may not contain U+0000, since they are serialized a
-    zero-terminated c-strings.
-
-    @throw out_of_range.407  if `j.is_number_unsigned() && j.get<std::uint64_t>() > 9223372036854775807`
-    @throw out_of_range.409  if a key in `j` contains a NULL (U+0000)
-    @throw type_error.317    if `!j.is_object()`
-
-    @pre The input `j` is required to be an object: `j.is_object() == true`.
-
-    @note Any BSON output created via @ref to_bson can be successfully parsed
-          by @ref from_bson.
-
-    @param[in] j  JSON value to serialize
-    @return BSON serialization as byte vector
-
-    @complexity Linear in the size of the JSON value @a j.
-
-    @liveexample{The example shows the serialization of a JSON value to a byte
-    vector in BSON format.,to_bson}
-
-    @sa http://bsonspec.org/spec.html
-    @sa @ref from_bson(detail::input_adapter&&, const bool strict) for the
-        analogous deserialization
-    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
-             related UBJSON format
-    @sa @ref to_cbor(const basic_json&) for the related CBOR format
-    @sa @ref to_msgpack(const basic_json&) for the related MessagePack format
-    */
-    static std::vector<uint8_t> to_bson(const basic_json& j)
-    {
-        std::vector<uint8_t> result;
-        to_bson(j, result);
-        return result;
-    }
-
-    /*!
-    @brief Serializes the given JSON object `j` to BSON and forwards the
-           corresponding BSON-representation to the given output_adapter `o`.
-    @param j The JSON object to convert to BSON.
-    @param o The output adapter that receives the binary BSON representation.
-    @pre The input `j` shall be an object: `j.is_object() == true`
-    @sa @ref to_bson(const basic_json&)
-    */
-    static void to_bson(const basic_json& j, detail::output_adapter<uint8_t> o)
-    {
-        binary_writer<uint8_t>(o).write_bson(j);
-    }
-
-    /*!
-    @copydoc to_bson(const basic_json&, detail::output_adapter<uint8_t>)
-    */
-    static void to_bson(const basic_json& j, detail::output_adapter<char> o)
-    {
-        binary_writer<char>(o).write_bson(j);
-    }
-
-
-    /*!
-    @brief create a JSON value from an input in CBOR format
-
-    Deserializes a given input @a i to a JSON value using the CBOR (Concise
-    Binary Object Representation) serialization format.
-
-    The library maps CBOR types to JSON value types as follows:
-
-    CBOR type              | JSON value type | first byte
-    ---------------------- | --------------- | ----------
-    Integer                | number_unsigned | 0x00..0x17
-    Unsigned integer       | number_unsigned | 0x18
-    Unsigned integer       | number_unsigned | 0x19
-    Unsigned integer       | number_unsigned | 0x1A
-    Unsigned integer       | number_unsigned | 0x1B
-    Negative integer       | number_integer  | 0x20..0x37
-    Negative integer       | number_integer  | 0x38
-    Negative integer       | number_integer  | 0x39
-    Negative integer       | number_integer  | 0x3A
-    Negative integer       | number_integer  | 0x3B
-    Byte string            | binary          | 0x40..0x57
-    Byte string            | binary          | 0x58
-    Byte string            | binary          | 0x59
-    Byte string            | binary          | 0x5A
-    Byte string            | binary          | 0x5B
-    UTF-8 string           | string          | 0x60..0x77
-    UTF-8 string           | string          | 0x78
-    UTF-8 string           | string          | 0x79
-    UTF-8 string           | string          | 0x7A
-    UTF-8 string           | string          | 0x7B
-    UTF-8 string           | string          | 0x7F
-    array                  | array           | 0x80..0x97
-    array                  | array           | 0x98
-    array                  | array           | 0x99
-    array                  | array           | 0x9A
-    array                  | array           | 0x9B
-    array                  | array           | 0x9F
-    map                    | object          | 0xA0..0xB7
-    map                    | object          | 0xB8
-    map                    | object          | 0xB9
-    map                    | object          | 0xBA
-    map                    | object          | 0xBB
-    map                    | object          | 0xBF
-    False                  | `false`         | 0xF4
-    True                   | `true`          | 0xF5
-    Null                   | `null`          | 0xF6
-    Half-Precision Float   | number_float    | 0xF9
-    Single-Precision Float | number_float    | 0xFA
-    Double-Precision Float | number_float    | 0xFB
-
-    @warning The mapping is **incomplete** in the sense that not all CBOR
-             types can be converted to a JSON value. The following CBOR types
-             are not supported and will yield parse errors (parse_error.112):
-             - date/time (0xC0..0xC1)
-             - bignum (0xC2..0xC3)
-             - decimal fraction (0xC4)
-             - bigfloat (0xC5)
-             - expected conversions (0xD5..0xD7)
-             - simple values (0xE0..0xF3, 0xF8)
-             - undefined (0xF7)
-
-    @warning CBOR allows map keys of any type, whereas JSON only allows
-             strings as keys in object values. Therefore, CBOR maps with keys
-             other than UTF-8 strings are rejected (parse_error.113).
-
-    @note Any CBOR output created @ref to_cbor can be successfully parsed by
-          @ref from_cbor.
-
-    @param[in] i  an input in CBOR format convertible to an input adapter
-    @param[in] strict  whether to expect the input to be consumed until EOF
-                       (true by default)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-    @param[in] tag_handler how to treat CBOR tags (optional, error by default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.110 if the given input ends prematurely or the end of
-    file was not reached when @a strict was set to true
-    @throw parse_error.112 if unsupported features from CBOR were
-    used in the given input @a v or if the input is not valid CBOR
-    @throw parse_error.113 if a string was expected as map key, but not found
-
-    @complexity Linear in the size of the input @a i.
-
-    @liveexample{The example shows the deserialization of a byte vector in CBOR
-    format to a JSON value.,from_cbor}
-
-    @sa http://cbor.io
-    @sa @ref to_cbor(const basic_json&) for the analogous serialization
-    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for the
-        related MessagePack format
-    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
-        related UBJSON format
-
-    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
-           consume input adapters, removed start_index parameter, and added
-           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
-           since 3.2.0; added @a tag_handler parameter since 3.9.0.
-    */
-    template<typename InputType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_cbor(InputType&& i,
-                                const bool strict = true,
-                                const bool allow_exceptions = true,
-                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::forward<InputType>(i));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    /*!
-    @copydoc from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t)
-    */
-    template<typename IteratorType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_cbor(IteratorType first, IteratorType last,
-                                const bool strict = true,
-                                const bool allow_exceptions = true,
-                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::move(first), std::move(last));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    template<typename T>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
-    static basic_json from_cbor(const T* ptr, std::size_t len,
-                                const bool strict = true,
-                                const bool allow_exceptions = true,
-                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
-    {
-        return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler);
-    }
-
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
-    static basic_json from_cbor(detail::span_input_adapter&& i,
-                                const bool strict = true,
-                                const bool allow_exceptions = true,
-                                const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = i.get();
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    /*!
-    @brief create a JSON value from an input in MessagePack format
-
-    Deserializes a given input @a i to a JSON value using the MessagePack
-    serialization format.
-
-    The library maps MessagePack types to JSON value types as follows:
-
-    MessagePack type | JSON value type | first byte
-    ---------------- | --------------- | ----------
-    positive fixint  | number_unsigned | 0x00..0x7F
-    fixmap           | object          | 0x80..0x8F
-    fixarray         | array           | 0x90..0x9F
-    fixstr           | string          | 0xA0..0xBF
-    nil              | `null`          | 0xC0
-    false            | `false`         | 0xC2
-    true             | `true`          | 0xC3
-    float 32         | number_float    | 0xCA
-    float 64         | number_float    | 0xCB
-    uint 8           | number_unsigned | 0xCC
-    uint 16          | number_unsigned | 0xCD
-    uint 32          | number_unsigned | 0xCE
-    uint 64          | number_unsigned | 0xCF
-    int 8            | number_integer  | 0xD0
-    int 16           | number_integer  | 0xD1
-    int 32           | number_integer  | 0xD2
-    int 64           | number_integer  | 0xD3
-    str 8            | string          | 0xD9
-    str 16           | string          | 0xDA
-    str 32           | string          | 0xDB
-    array 16         | array           | 0xDC
-    array 32         | array           | 0xDD
-    map 16           | object          | 0xDE
-    map 32           | object          | 0xDF
-    bin 8            | binary          | 0xC4
-    bin 16           | binary          | 0xC5
-    bin 32           | binary          | 0xC6
-    ext 8            | binary          | 0xC7
-    ext 16           | binary          | 0xC8
-    ext 32           | binary          | 0xC9
-    fixext 1         | binary          | 0xD4
-    fixext 2         | binary          | 0xD5
-    fixext 4         | binary          | 0xD6
-    fixext 8         | binary          | 0xD7
-    fixext 16        | binary          | 0xD8
-    negative fixint  | number_integer  | 0xE0-0xFF
-
-    @note Any MessagePack output created @ref to_msgpack can be successfully
-          parsed by @ref from_msgpack.
-
-    @param[in] i  an input in MessagePack format convertible to an input
-                  adapter
-    @param[in] strict  whether to expect the input to be consumed until EOF
-                       (true by default)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.110 if the given input ends prematurely or the end of
-    file was not reached when @a strict was set to true
-    @throw parse_error.112 if unsupported features from MessagePack were
-    used in the given input @a i or if the input is not valid MessagePack
-    @throw parse_error.113 if a string was expected as map key, but not found
-
-    @complexity Linear in the size of the input @a i.
-
-    @liveexample{The example shows the deserialization of a byte vector in
-    MessagePack format to a JSON value.,from_msgpack}
-
-    @sa http://msgpack.org
-    @sa @ref to_msgpack(const basic_json&) for the analogous serialization
-    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
-        related CBOR format
-    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for
-        the related UBJSON format
-    @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for
-        the related BSON format
-
-    @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
-           consume input adapters, removed start_index parameter, and added
-           @a strict parameter since 3.0.0; added @a allow_exceptions parameter
-           since 3.2.0
-    */
-    template<typename InputType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_msgpack(InputType&& i,
-                                   const bool strict = true,
-                                   const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::forward<InputType>(i));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    /*!
-    @copydoc from_msgpack(detail::input_adapter&&, const bool, const bool)
-    */
-    template<typename IteratorType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_msgpack(IteratorType first, IteratorType last,
-                                   const bool strict = true,
-                                   const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::move(first), std::move(last));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-
-    template<typename T>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
-    static basic_json from_msgpack(const T* ptr, std::size_t len,
-                                   const bool strict = true,
-                                   const bool allow_exceptions = true)
-    {
-        return from_msgpack(ptr, ptr + len, strict, allow_exceptions);
-    }
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
-    static basic_json from_msgpack(detail::span_input_adapter&& i,
-                                   const bool strict = true,
-                                   const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = i.get();
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-
-    /*!
-    @brief create a JSON value from an input in UBJSON format
-
-    Deserializes a given input @a i to a JSON value using the UBJSON (Universal
-    Binary JSON) serialization format.
-
-    The library maps UBJSON types to JSON value types as follows:
-
-    UBJSON type | JSON value type                         | marker
-    ----------- | --------------------------------------- | ------
-    no-op       | *no value, next value is read*          | `N`
-    null        | `null`                                  | `Z`
-    false       | `false`                                 | `F`
-    true        | `true`                                  | `T`
-    float32     | number_float                            | `d`
-    float64     | number_float                            | `D`
-    uint8       | number_unsigned                         | `U`
-    int8        | number_integer                          | `i`
-    int16       | number_integer                          | `I`
-    int32       | number_integer                          | `l`
-    int64       | number_integer                          | `L`
-    high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H'
-    string      | string                                  | `S`
-    char        | string                                  | `C`
-    array       | array (optimized values are supported)  | `[`
-    object      | object (optimized values are supported) | `{`
-
-    @note The mapping is **complete** in the sense that any UBJSON value can
-          be converted to a JSON value.
-
-    @param[in] i  an input in UBJSON format convertible to an input adapter
-    @param[in] strict  whether to expect the input to be consumed until EOF
-                       (true by default)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.110 if the given input ends prematurely or the end of
-    file was not reached when @a strict was set to true
-    @throw parse_error.112 if a parse error occurs
-    @throw parse_error.113 if a string could not be parsed successfully
-
-    @complexity Linear in the size of the input @a i.
-
-    @liveexample{The example shows the deserialization of a byte vector in
-    UBJSON format to a JSON value.,from_ubjson}
-
-    @sa http://ubjson.org
-    @sa @ref to_ubjson(const basic_json&, const bool, const bool) for the
-             analogous serialization
-    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
-        related CBOR format
-    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for
-        the related MessagePack format
-    @sa @ref from_bson(detail::input_adapter&&, const bool, const bool) for
-        the related BSON format
-
-    @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0
-    */
-    template<typename InputType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_ubjson(InputType&& i,
-                                  const bool strict = true,
-                                  const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::forward<InputType>(i));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    /*!
-    @copydoc from_ubjson(detail::input_adapter&&, const bool, const bool)
-    */
-    template<typename IteratorType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_ubjson(IteratorType first, IteratorType last,
-                                  const bool strict = true,
-                                  const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::move(first), std::move(last));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    template<typename T>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
-    static basic_json from_ubjson(const T* ptr, std::size_t len,
-                                  const bool strict = true,
-                                  const bool allow_exceptions = true)
-    {
-        return from_ubjson(ptr, ptr + len, strict, allow_exceptions);
-    }
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
-    static basic_json from_ubjson(detail::span_input_adapter&& i,
-                                  const bool strict = true,
-                                  const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = i.get();
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-
-    /*!
-    @brief Create a JSON value from an input in BSON format
-
-    Deserializes a given input @a i to a JSON value using the BSON (Binary JSON)
-    serialization format.
-
-    The library maps BSON record types to JSON value types as follows:
-
-    BSON type       | BSON marker byte | JSON value type
-    --------------- | ---------------- | ---------------------------
-    double          | 0x01             | number_float
-    string          | 0x02             | string
-    document        | 0x03             | object
-    array           | 0x04             | array
-    binary          | 0x05             | still unsupported
-    undefined       | 0x06             | still unsupported
-    ObjectId        | 0x07             | still unsupported
-    boolean         | 0x08             | boolean
-    UTC Date-Time   | 0x09             | still unsupported
-    null            | 0x0A             | null
-    Regular Expr.   | 0x0B             | still unsupported
-    DB Pointer      | 0x0C             | still unsupported
-    JavaScript Code | 0x0D             | still unsupported
-    Symbol          | 0x0E             | still unsupported
-    JavaScript Code | 0x0F             | still unsupported
-    int32           | 0x10             | number_integer
-    Timestamp       | 0x11             | still unsupported
-    128-bit decimal float | 0x13       | still unsupported
-    Max Key         | 0x7F             | still unsupported
-    Min Key         | 0xFF             | still unsupported
-
-    @warning The mapping is **incomplete**. The unsupported mappings
-             are indicated in the table above.
-
-    @param[in] i  an input in BSON format convertible to an input adapter
-    @param[in] strict  whether to expect the input to be consumed until EOF
-                       (true by default)
-    @param[in] allow_exceptions  whether to throw exceptions in case of a
-    parse error (optional, true by default)
-
-    @return deserialized JSON value; in case of a parse error and
-            @a allow_exceptions set to `false`, the return value will be
-            value_t::discarded.
-
-    @throw parse_error.114 if an unsupported BSON record type is encountered
-
-    @complexity Linear in the size of the input @a i.
-
-    @liveexample{The example shows the deserialization of a byte vector in
-    BSON format to a JSON value.,from_bson}
-
-    @sa http://bsonspec.org/spec.html
-    @sa @ref to_bson(const basic_json&) for the analogous serialization
-    @sa @ref from_cbor(detail::input_adapter&&, const bool, const bool, const cbor_tag_handler_t) for the
-        related CBOR format
-    @sa @ref from_msgpack(detail::input_adapter&&, const bool, const bool) for
-        the related MessagePack format
-    @sa @ref from_ubjson(detail::input_adapter&&, const bool, const bool) for the
-        related UBJSON format
-    */
-    template<typename InputType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_bson(InputType&& i,
-                                const bool strict = true,
-                                const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::forward<InputType>(i));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    /*!
-    @copydoc from_bson(detail::input_adapter&&, const bool, const bool)
-    */
-    template<typename IteratorType>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json from_bson(IteratorType first, IteratorType last,
-                                const bool strict = true,
-                                const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = detail::input_adapter(std::move(first), std::move(last));
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-
-    template<typename T>
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
-    static basic_json from_bson(const T* ptr, std::size_t len,
-                                const bool strict = true,
-                                const bool allow_exceptions = true)
-    {
-        return from_bson(ptr, ptr + len, strict, allow_exceptions);
-    }
-
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
-    static basic_json from_bson(detail::span_input_adapter&& i,
-                                const bool strict = true,
-                                const bool allow_exceptions = true)
-    {
-        basic_json result;
-        detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
-        auto ia = i.get();
-        const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
-        return res ? result : basic_json(value_t::discarded);
-    }
-    /// @}
-
-    //////////////////////////
-    // JSON Pointer support //
-    //////////////////////////
-
-    /// @name JSON Pointer functions
-    /// @{
-
-    /*!
-    @brief access specified element via JSON Pointer
-
-    Uses a JSON pointer to retrieve a reference to the respective JSON value.
-    No bound checking is performed. Similar to @ref operator[](const typename
-    object_t::key_type&), `null` values are created in arrays and objects if
-    necessary.
-
-    In particular:
-    - If the JSON pointer points to an object key that does not exist, it
-      is created an filled with a `null` value before a reference to it
-      is returned.
-    - If the JSON pointer points to an array index that does not exist, it
-      is created an filled with a `null` value before a reference to it
-      is returned. All indices between the current maximum and the given
-      index are also filled with `null`.
-    - The special value `-` is treated as a synonym for the index past the
-      end.
-
-    @param[in] ptr  a JSON pointer
-
-    @return reference to the element pointed to by @a ptr
-
-    @complexity Constant.
-
-    @throw parse_error.106   if an array index begins with '0'
-    @throw parse_error.109   if an array index was not a number
-    @throw out_of_range.404  if the JSON pointer can not be resolved
-
-    @liveexample{The behavior is shown in the example.,operatorjson_pointer}
-
-    @since version 2.0.0
-    */
-    reference operator[](const json_pointer& ptr)
-    {
-        return ptr.get_unchecked(this);
-    }
-
-    /*!
-    @brief access specified element via JSON Pointer
-
-    Uses a JSON pointer to retrieve a reference to the respective JSON value.
-    No bound checking is performed. The function does not change the JSON
-    value; no `null` values are created. In particular, the special value
-    `-` yields an exception.
-
-    @param[in] ptr  JSON pointer to the desired element
-
-    @return const reference to the element pointed to by @a ptr
-
-    @complexity Constant.
-
-    @throw parse_error.106   if an array index begins with '0'
-    @throw parse_error.109   if an array index was not a number
-    @throw out_of_range.402  if the array index '-' is used
-    @throw out_of_range.404  if the JSON pointer can not be resolved
-
-    @liveexample{The behavior is shown in the example.,operatorjson_pointer_const}
-
-    @since version 2.0.0
-    */
-    const_reference operator[](const json_pointer& ptr) const
-    {
-        return ptr.get_unchecked(this);
-    }
-
-    /*!
-    @brief access specified element via JSON Pointer
-
-    Returns a reference to the element at with specified JSON pointer @a ptr,
-    with bounds checking.
-
-    @param[in] ptr  JSON pointer to the desired element
-
-    @return reference to the element pointed to by @a ptr
-
-    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
-    begins with '0'. See example below.
-
-    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
-    is not a number. See example below.
-
-    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
-    is out of range. See example below.
-
-    @throw out_of_range.402 if the array index '-' is used in the passed JSON
-    pointer @a ptr. As `at` provides checked access (and no elements are
-    implicitly inserted), the index '-' is always invalid. See example below.
-
-    @throw out_of_range.403 if the JSON pointer describes a key of an object
-    which cannot be found. See example below.
-
-    @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
-    See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @since version 2.0.0
-
-    @liveexample{The behavior is shown in the example.,at_json_pointer}
-    */
-    reference at(const json_pointer& ptr)
-    {
-        return ptr.get_checked(this);
-    }
-
-    /*!
-    @brief access specified element via JSON Pointer
-
-    Returns a const reference to the element at with specified JSON pointer @a
-    ptr, with bounds checking.
-
-    @param[in] ptr  JSON pointer to the desired element
-
-    @return reference to the element pointed to by @a ptr
-
-    @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
-    begins with '0'. See example below.
-
-    @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
-    is not a number. See example below.
-
-    @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
-    is out of range. See example below.
-
-    @throw out_of_range.402 if the array index '-' is used in the passed JSON
-    pointer @a ptr. As `at` provides checked access (and no elements are
-    implicitly inserted), the index '-' is always invalid. See example below.
-
-    @throw out_of_range.403 if the JSON pointer describes a key of an object
-    which cannot be found. See example below.
-
-    @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
-    See example below.
-
-    @exceptionsafety Strong guarantee: if an exception is thrown, there are no
-    changes in the JSON value.
-
-    @complexity Constant.
-
-    @since version 2.0.0
-
-    @liveexample{The behavior is shown in the example.,at_json_pointer_const}
-    */
-    const_reference at(const json_pointer& ptr) const
-    {
-        return ptr.get_checked(this);
-    }
-
-    /*!
-    @brief return flattened JSON value
-
-    The function creates a JSON object whose keys are JSON pointers (see [RFC
-    6901](https://tools.ietf.org/html/rfc6901)) and whose values are all
-    primitive. The original JSON value can be restored using the @ref
-    unflatten() function.
-
-    @return an object that maps JSON pointers to primitive values
-
-    @note Empty objects and arrays are flattened to `null` and will not be
-          reconstructed correctly by the @ref unflatten() function.
-
-    @complexity Linear in the size the JSON value.
-
-    @liveexample{The following code shows how a JSON object is flattened to an
-    object whose keys consist of JSON pointers.,flatten}
-
-    @sa @ref unflatten() for the reverse function
-
-    @since version 2.0.0
-    */
-    basic_json flatten() const
-    {
-        basic_json result(value_t::object);
-        json_pointer::flatten("", *this, result);
-        return result;
-    }
-
-    /*!
-    @brief unflatten a previously flattened JSON value
-
-    The function restores the arbitrary nesting of a JSON value that has been
-    flattened before using the @ref flatten() function. The JSON value must
-    meet certain constraints:
-    1. The value must be an object.
-    2. The keys must be JSON pointers (see
-       [RFC 6901](https://tools.ietf.org/html/rfc6901))
-    3. The mapped values must be primitive JSON types.
-
-    @return the original JSON from a flattened version
-
-    @note Empty objects and arrays are flattened by @ref flatten() to `null`
-          values and can not unflattened to their original type. Apart from
-          this example, for a JSON value `j`, the following is always true:
-          `j == j.flatten().unflatten()`.
-
-    @complexity Linear in the size the JSON value.
-
-    @throw type_error.314  if value is not an object
-    @throw type_error.315  if object values are not primitive
-
-    @liveexample{The following code shows how a flattened JSON object is
-    unflattened into the original nested JSON object.,unflatten}
-
-    @sa @ref flatten() for the reverse function
-
-    @since version 2.0.0
-    */
-    basic_json unflatten() const
-    {
-        return json_pointer::unflatten(*this);
-    }
-
-    /// @}
-
-    //////////////////////////
-    // JSON Patch functions //
-    //////////////////////////
-
-    /// @name JSON Patch functions
-    /// @{
-
-    /*!
-    @brief applies a JSON patch
-
-    [JSON Patch](http://jsonpatch.com) defines a JSON document structure for
-    expressing a sequence of operations to apply to a JSON) document. With
-    this function, a JSON Patch is applied to the current JSON value by
-    executing all operations from the patch.
-
-    @param[in] json_patch  JSON patch document
-    @return patched document
-
-    @note The application of a patch is atomic: Either all operations succeed
-          and the patched document is returned or an exception is thrown. In
-          any case, the original value is not changed: the patch is applied
-          to a copy of the value.
-
-    @throw parse_error.104 if the JSON patch does not consist of an array of
-    objects
-
-    @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory
-    attributes are missing); example: `"operation add must have member path"`
-
-    @throw out_of_range.401 if an array index is out of range.
-
-    @throw out_of_range.403 if a JSON pointer inside the patch could not be
-    resolved successfully in the current JSON value; example: `"key baz not
-    found"`
-
-    @throw out_of_range.405 if JSON pointer has no parent ("add", "remove",
-    "move")
-
-    @throw other_error.501 if "test" operation was unsuccessful
-
-    @complexity Linear in the size of the JSON value and the length of the
-    JSON patch. As usually only a fraction of the JSON value is affected by
-    the patch, the complexity can usually be neglected.
-
-    @liveexample{The following code shows how a JSON patch is applied to a
-    value.,patch}
-
-    @sa @ref diff -- create a JSON patch by comparing two JSON values
-
-    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
-    @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901)
-
-    @since version 2.0.0
-    */
-    basic_json patch(const basic_json& json_patch) const
-    {
-        // make a working copy to apply the patch to
-        basic_json result = *this;
-
-        // the valid JSON Patch operations
-        enum class patch_operations {add, remove, replace, move, copy, test, invalid};
-
-        const auto get_op = [](const std::string & op)
-        {
-            if (op == "add")
-            {
-                return patch_operations::add;
-            }
-            if (op == "remove")
-            {
-                return patch_operations::remove;
-            }
-            if (op == "replace")
-            {
-                return patch_operations::replace;
-            }
-            if (op == "move")
-            {
-                return patch_operations::move;
-            }
-            if (op == "copy")
-            {
-                return patch_operations::copy;
-            }
-            if (op == "test")
-            {
-                return patch_operations::test;
-            }
-
-            return patch_operations::invalid;
-        };
-
-        // wrapper for "add" operation; add value at ptr
-        const auto operation_add = [&result](json_pointer & ptr, basic_json val)
-        {
-            // adding to the root of the target document means replacing it
-            if (ptr.empty())
-            {
-                result = val;
-                return;
-            }
-
-            // make sure the top element of the pointer exists
-            json_pointer top_pointer = ptr.top();
-            if (top_pointer != ptr)
-            {
-                result.at(top_pointer);
-            }
-
-            // get reference to parent of JSON pointer ptr
-            const auto last_path = ptr.back();
-            ptr.pop_back();
-            basic_json& parent = result[ptr];
-
-            switch (parent.m_type)
-            {
-                case value_t::null:
-                case value_t::object:
-                {
-                    // use operator[] to add value
-                    parent[last_path] = val;
-                    break;
-                }
-
-                case value_t::array:
-                {
-                    if (last_path == "-")
-                    {
-                        // special case: append to back
-                        parent.push_back(val);
-                    }
-                    else
-                    {
-                        const auto idx = json_pointer::array_index(last_path);
-                        if (JSON_HEDLEY_UNLIKELY(idx > parent.size()))
-                        {
-                            // avoid undefined behavior
-                            JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range"));
-                        }
-
-                        // default case: insert add offset
-                        parent.insert(parent.begin() + static_cast<difference_type>(idx), val);
-                    }
-                    break;
-                }
-
-                // if there exists a parent it cannot be primitive
-                default:            // LCOV_EXCL_LINE
-                    JSON_ASSERT(false);  // LCOV_EXCL_LINE
-            }
-        };
-
-        // wrapper for "remove" operation; remove value at ptr
-        const auto operation_remove = [&result](json_pointer & ptr)
-        {
-            // get reference to parent of JSON pointer ptr
-            const auto last_path = ptr.back();
-            ptr.pop_back();
-            basic_json& parent = result.at(ptr);
-
-            // remove child
-            if (parent.is_object())
-            {
-                // perform range check
-                auto it = parent.find(last_path);
-                if (JSON_HEDLEY_LIKELY(it != parent.end()))
-                {
-                    parent.erase(it);
-                }
-                else
-                {
-                    JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found"));
-                }
-            }
-            else if (parent.is_array())
-            {
-                // note erase performs range check
-                parent.erase(json_pointer::array_index(last_path));
-            }
-        };
-
-        // type check: top level value must be an array
-        if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array()))
-        {
-            JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects"));
-        }
-
-        // iterate and apply the operations
-        for (const auto& val : json_patch)
-        {
-            // wrapper to get a value for an operation
-            const auto get_value = [&val](const std::string & op,
-                                          const std::string & member,
-                                          bool string_type) -> basic_json &
-            {
-                // find value
-                auto it = val.m_value.object->find(member);
-
-                // context-sensitive error message
-                const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'";
-
-                // check if desired value is present
-                if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end()))
-                {
-                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'"));
-                }
-
-                // check if result is of type string
-                if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string()))
-                {
-                    JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'"));
-                }
-
-                // no error: return value
-                return it->second;
-            };
-
-            // type check: every element of the array must be an object
-            if (JSON_HEDLEY_UNLIKELY(!val.is_object()))
-            {
-                JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects"));
-            }
-
-            // collect mandatory members
-            const auto op = get_value("op", "op", true).template get<std::string>();
-            const auto path = get_value(op, "path", true).template get<std::string>();
-            json_pointer ptr(path);
-
-            switch (get_op(op))
-            {
-                case patch_operations::add:
-                {
-                    operation_add(ptr, get_value("add", "value", false));
-                    break;
-                }
-
-                case patch_operations::remove:
-                {
-                    operation_remove(ptr);
-                    break;
-                }
-
-                case patch_operations::replace:
-                {
-                    // the "path" location must exist - use at()
-                    result.at(ptr) = get_value("replace", "value", false);
-                    break;
-                }
-
-                case patch_operations::move:
-                {
-                    const auto from_path = get_value("move", "from", true).template get<std::string>();
-                    json_pointer from_ptr(from_path);
-
-                    // the "from" location must exist - use at()
-                    basic_json v = result.at(from_ptr);
-
-                    // The move operation is functionally identical to a
-                    // "remove" operation on the "from" location, followed
-                    // immediately by an "add" operation at the target
-                    // location with the value that was just removed.
-                    operation_remove(from_ptr);
-                    operation_add(ptr, v);
-                    break;
-                }
-
-                case patch_operations::copy:
-                {
-                    const auto from_path = get_value("copy", "from", true).template get<std::string>();
-                    const json_pointer from_ptr(from_path);
-
-                    // the "from" location must exist - use at()
-                    basic_json v = result.at(from_ptr);
-
-                    // The copy is functionally identical to an "add"
-                    // operation at the target location using the value
-                    // specified in the "from" member.
-                    operation_add(ptr, v);
-                    break;
-                }
-
-                case patch_operations::test:
-                {
-                    bool success = false;
-                    JSON_TRY
-                    {
-                        // check if "value" matches the one at "path"
-                        // the "path" location must exist - use at()
-                        success = (result.at(ptr) == get_value("test", "value", false));
-                    }
-                    JSON_INTERNAL_CATCH (out_of_range&)
-                    {
-                        // ignore out of range errors: success remains false
-                    }
-
-                    // throw an exception if test fails
-                    if (JSON_HEDLEY_UNLIKELY(!success))
-                    {
-                        JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump()));
-                    }
-
-                    break;
-                }
-
-                default:
-                {
-                    // op must be "add", "remove", "replace", "move", "copy", or
-                    // "test"
-                    JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid"));
-                }
-            }
-        }
-
-        return result;
-    }
-
-    /*!
-    @brief creates a diff as a JSON patch
-
-    Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can
-    be changed into the value @a target by calling @ref patch function.
-
-    @invariant For two JSON values @a source and @a target, the following code
-    yields always `true`:
-    @code {.cpp}
-    source.patch(diff(source, target)) == target;
-    @endcode
-
-    @note Currently, only `remove`, `add`, and `replace` operations are
-          generated.
-
-    @param[in] source  JSON value to compare from
-    @param[in] target  JSON value to compare against
-    @param[in] path    helper value to create JSON pointers
-
-    @return a JSON patch to convert the @a source to @a target
-
-    @complexity Linear in the lengths of @a source and @a target.
-
-    @liveexample{The following code shows how a JSON patch is created as a
-    diff for two JSON values.,diff}
-
-    @sa @ref patch -- apply a JSON patch
-    @sa @ref merge_patch -- apply a JSON Merge Patch
-
-    @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
-
-    @since version 2.0.0
-    */
-    JSON_HEDLEY_WARN_UNUSED_RESULT
-    static basic_json diff(const basic_json& source, const basic_json& target,
-                           const std::string& path = "")
-    {
-        // the patch
-        basic_json result(value_t::array);
-
-        // if the values are the same, return empty patch
-        if (source == target)
-        {
-            return result;
-        }
-
-        if (source.type() != target.type())
-        {
-            // different types: replace value
-            result.push_back(
-            {
-                {"op", "replace"}, {"path", path}, {"value", target}
-            });
-            return result;
-        }
-
-        switch (source.type())
-        {
-            case value_t::array:
-            {
-                // first pass: traverse common elements
-                std::size_t i = 0;
-                while (i < source.size() && i < target.size())
-                {
-                    // recursive call to compare array values at index i
-                    auto temp_diff = diff(source[i], target[i], path + "/" + std::to_string(i));
-                    result.insert(result.end(), temp_diff.begin(), temp_diff.end());
-                    ++i;
-                }
-
-                // i now reached the end of at least one array
-                // in a second pass, traverse the remaining elements
-
-                // remove my remaining elements
-                const auto end_index = static_cast<difference_type>(result.size());
-                while (i < source.size())
-                {
-                    // add operations in reverse order to avoid invalid
-                    // indices
-                    result.insert(result.begin() + end_index, object(
-                    {
-                        {"op", "remove"},
-                        {"path", path + "/" + std::to_string(i)}
-                    }));
-                    ++i;
-                }
-
-                // add other remaining elements
-                while (i < target.size())
-                {
-                    result.push_back(
-                    {
-                        {"op", "add"},
-                        {"path", path + "/-"},
-                        {"value", target[i]}
-                    });
-                    ++i;
-                }
-
-                break;
-            }
-
-            case value_t::object:
-            {
-                // first pass: traverse this object's elements
-                for (auto it = source.cbegin(); it != source.cend(); ++it)
-                {
-                    // escape the key name to be used in a JSON patch
-                    const auto key = json_pointer::escape(it.key());
-
-                    if (target.find(it.key()) != target.end())
-                    {
-                        // recursive call to compare object values at key it
-                        auto temp_diff = diff(it.value(), target[it.key()], path + "/" + key);
-                        result.insert(result.end(), temp_diff.begin(), temp_diff.end());
-                    }
-                    else
-                    {
-                        // found a key that is not in o -> remove it
-                        result.push_back(object(
-                        {
-                            {"op", "remove"}, {"path", path + "/" + key}
-                        }));
-                    }
-                }
-
-                // second pass: traverse other object's elements
-                for (auto it = target.cbegin(); it != target.cend(); ++it)
-                {
-                    if (source.find(it.key()) == source.end())
-                    {
-                        // found a key that is not in this -> add it
-                        const auto key = json_pointer::escape(it.key());
-                        result.push_back(
-                        {
-                            {"op", "add"}, {"path", path + "/" + key},
-                            {"value", it.value()}
-                        });
-                    }
-                }
-
-                break;
-            }
-
-            default:
-            {
-                // both primitive type: replace value
-                result.push_back(
-                {
-                    {"op", "replace"}, {"path", path}, {"value", target}
-                });
-                break;
-            }
-        }
-
-        return result;
-    }
-
-    /// @}
-
-    ////////////////////////////////
-    // JSON Merge Patch functions //
-    ////////////////////////////////
-
-    /// @name JSON Merge Patch functions
-    /// @{
-
-    /*!
-    @brief applies a JSON Merge Patch
-
-    The merge patch format is primarily intended for use with the HTTP PATCH
-    method as a means of describing a set of modifications to a target
-    resource's content. This function applies a merge patch to the current
-    JSON value.
-
-    The function implements the following algorithm from Section 2 of
-    [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396):
-
-    ```
-    define MergePatch(Target, Patch):
-      if Patch is an Object:
-        if Target is not an Object:
-          Target = {} // Ignore the contents and set it to an empty Object
-        for each Name/Value pair in Patch:
-          if Value is null:
-            if Name exists in Target:
-              remove the Name/Value pair from Target
-          else:
-            Target[Name] = MergePatch(Target[Name], Value)
-        return Target
-      else:
-        return Patch
-    ```
-
-    Thereby, `Target` is the current object; that is, the patch is applied to
-    the current value.
-
-    @param[in] apply_patch  the patch to apply
-
-    @complexity Linear in the lengths of @a patch.
-
-    @liveexample{The following code shows how a JSON Merge Patch is applied to
-    a JSON document.,merge_patch}
-
-    @sa @ref patch -- apply a JSON patch
-    @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396)
-
-    @since version 3.0.0
-    */
-    void merge_patch(const basic_json& apply_patch)
-    {
-        if (apply_patch.is_object())
-        {
-            if (!is_object())
-            {
-                *this = object();
-            }
-            for (auto it = apply_patch.begin(); it != apply_patch.end(); ++it)
-            {
-                if (it.value().is_null())
-                {
-                    erase(it.key());
-                }
-                else
-                {
-                    operator[](it.key()).merge_patch(it.value());
-                }
-            }
-        }
-        else
-        {
-            *this = apply_patch;
-        }
-    }
-
-    /// @}
-};
-
-/*!
-@brief user-defined to_string function for JSON values
-
-This function implements a user-defined to_string  for JSON objects.
-
-@param[in] j  a JSON object
-@return a std::string object
-*/
-
-NLOHMANN_BASIC_JSON_TPL_DECLARATION
-std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j)
-{
-    return j.dump();
-}
-} // namespace nlohmann
-
-///////////////////////
-// nonmember support //
-///////////////////////
-
-// specialization of std::swap, and std::hash
-namespace std
-{
-
-/// hash value for JSON objects
-template<>
-struct hash<nlohmann::json>
-{
-    /*!
-    @brief return a hash value for a JSON object
-
-    @since version 1.0.0
-    */
-    std::size_t operator()(const nlohmann::json& j) const
-    {
-        return nlohmann::detail::hash(j);
-    }
-};
-
-/// specialization for std::less<value_t>
-/// @note: do not remove the space after '<',
-///        see https://github.com/nlohmann/json/pull/679
-template<>
-struct less<::nlohmann::detail::value_t>
-{
-    /*!
-    @brief compare two value_t enum values
-    @since version 3.0.0
-    */
-    bool operator()(nlohmann::detail::value_t lhs,
-                    nlohmann::detail::value_t rhs) const noexcept
-    {
-        return nlohmann::detail::operator<(lhs, rhs);
-    }
-};
-
-// C++20 prohibit function specialization in the std namespace.
-#ifndef JSON_HAS_CPP_20
-
-/*!
-@brief exchanges the values of two JSON objects
-
-@since version 1.0.0
-*/
-template<>
-inline void swap<nlohmann::json>(nlohmann::json& j1, nlohmann::json& j2) noexcept(
-    is_nothrow_move_constructible<nlohmann::json>::value&&
-    is_nothrow_move_assignable<nlohmann::json>::value
-)
-{
-    j1.swap(j2);
-}
-
-#endif
-
-} // namespace std
-
-/*!
-@brief user-defined string literal for JSON values
-
-This operator implements a user-defined string literal for JSON objects. It
-can be used by adding `"_json"` to a string literal and returns a JSON object
-if no parse error occurred.
-
-@param[in] s  a string representation of a JSON object
-@param[in] n  the length of string @a s
-@return a JSON object
-
-@since version 1.0.0
-*/
-// Work around compiler bug in nvcc 11.0, see NVIDIA/NVBench#18
-#if defined(__NVCC__) && \
-    __cplusplus >= 201703L && \
-    __CUDACC_VER_MAJOR__ == 11 && \
-     __CUDACC_VER_MINOR__ == 0
-
-#else
-JSON_HEDLEY_NON_NULL(1)
-inline nlohmann::json operator "" _json(const char* s, std::size_t n)
-{
-    return nlohmann::json::parse(s, s + n);
-}
-#endif
-
-/*!
-@brief user-defined string literal for JSON pointer
-
-This operator implements a user-defined string literal for JSON Pointers. It
-can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer
-object if no parse error occurred.
-
-@param[in] s  a string representation of a JSON Pointer
-@param[in] n  the length of string @a s
-@return a JSON pointer object
-
-@since version 2.0.0
-*/
-JSON_HEDLEY_NON_NULL(1)
-inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
-{
-    return nlohmann::json::json_pointer(std::string(s, n));
-}
-
-#include <nlohmann/detail/macro_unscope.hpp>
-
-#endif  // INCLUDE_NLOHMANN_JSON_HPP_
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index ef9fb48f..dfd7b07e 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -188,7 +188,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
 }
 using my_types = nvbench::type_list<int, float, double>;
 NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types))
-  .set_type_axis_names({"ValueType"});
+  .set_type_axes_names({"ValueType"});
 ```
 
 The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in
diff --git a/docs/cli_help.md b/docs/cli_help.md
index 8629e8f5..0336c5ed 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -89,8 +89,15 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
+* `--stopping-criterion <criterion>`
+  * After `--min-samples` is satisfied, use `<criterion>` to decide whether
+    enough samples have been collected.
+  * Only applies to Cold measurements.
+  * Default is `stdrel` (`--stopping-criterion stdrel`)
+
 * `--min-time <seconds>`
   * Accumulate at least `<seconds>` of execution time per measurement.
+  * Only applies to the `stdrel` stopping criterion.
   * Default is 0.5 seconds.
   * If both GPU and CPU times are gathered, this applies to GPU time only.
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
@@ -100,6 +107,7 @@
   * Gather samples until the error in the measurement drops below `<value>`.
   * Noise is specified as the percent relative standard deviation.
   * Default is 0.5% (`--max-noise 0.5`)
+  * Only applies to the `stdrel` stopping criterion.
   * Only applies to Cold measurements.
   * If both GPU and CPU times are gathered, this applies to GPU noise only.
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 219fc898..a98bcbeb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,51 +1,71 @@
 set(example_srcs
+  auto_throughput.cu
   axes.cu
+  custom_criterion.cu
   enums.cu
   exec_tag_sync.cu
   exec_tag_timer.cu
   skip.cu
   stream.cu
   throughput.cu
-  auto_throughput.cu
 )
 
 # Metatarget for all examples:
 add_custom_target(nvbench.example.all)
 add_dependencies(nvbench.all nvbench.example.all)
 
-foreach(example_src IN LISTS example_srcs)
-  get_filename_component(example_name "${example_src}" NAME_WLE)
-  string(PREPEND example_name "nvbench.example.")
-  add_executable(${example_name} "${example_src}")
-  nvbench_config_target(${example_name})
-  target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
-  target_link_libraries(${example_name} PRIVATE nvbench::main)
-  set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
-  add_test(NAME ${example_name}
-    COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
-  )
 
-  add_dependencies(nvbench.example.all ${example_name})
-endforeach()
+function (nvbench_add_examples_target target_prefix cuda_std)
+  add_custom_target(${target_prefix}.all)
+  add_dependencies(nvbench.example.all ${target_prefix}.all)
 
-# Silence some warnings from old thrust headers:
-set(thrust_examples
-  auto_throughput
-  axes
-  exec_tag_sync
-  exec_tag_timer
-  skip
-  throughput
-)
-foreach (example IN LISTS thrust_examples)
-  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    # C4324: structure was padded due to alignment specifier
-    nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4324")
-
-    # warning C4201: nonstandard extension used: nameless struct/union:
-    # Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
-    if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
-      nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4201")
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    string(PREPEND example_name "${target_prefix}.")
+    add_executable(${example_name} "${example_src}")
+    nvbench_config_target(${example_name})
+    target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
+    target_link_libraries(${example_name} PRIVATE nvbench::main)
+    set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
+    add_test(NAME ${example_name}
+      COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
+    )
+
+    # These should not deadlock. If they do, it may be that the CUDA context was created before
+    # setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136.
+    set_tests_properties(${example_name} PROPERTIES
+      FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
+    )
+
+    add_dependencies(${target_prefix}.all ${example_name})
+  endforeach()
+
+  # Silence some warnings from old thrust headers:
+  set(thrust_examples
+    auto_throughput
+    axes
+    custom_criterion
+    exec_tag_sync
+    exec_tag_timer
+    skip
+    stream
+    throughput
+  )
+  foreach (example IN LISTS thrust_examples)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+      # C4324: structure was padded due to alignment specifier
+      nvbench_add_cxx_flag(${target_prefix}.${example} PRIVATE "/wd4324")
+
+      # warning C4201: nonstandard extension used: nameless struct/union:
+      # Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
+      if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
+        nvbench_add_cxx_flag(${target_prefix}.${example} PRIVATE "/wd4201")
+      endif()
     endif()
-  endif()
+  endforeach()
+endfunction()
+
+
+foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
+  nvbench_add_examples_target(nvbench.example.cpp${std} ${std})
 endforeach()
diff --git a/examples/axes.cu b/examples/axes.cu
index b8c21152..44ae5988 100644
--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -56,8 +56,8 @@ NVBENCH_BENCH(single_float64_axis)
 void copy_sweep_grid_shape(nvbench::state &state)
 {
   // Get current parameters:
-  const int block_size = static_cast<int>(state.get_int64("BlockSize"));
-  const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
+  const auto block_size = static_cast<unsigned int>(state.get_int64("BlockSize"));
+  const auto num_blocks = static_cast<unsigned int>(state.get_int64("NumBlocks"));
 
   // Number of int32s in 256 MiB:
   const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
@@ -77,6 +77,7 @@ void copy_sweep_grid_shape(nvbench::state &state)
      num_values,
      in_ptr  = thrust::raw_pointer_cast(in.data()),
      out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
       nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
         in_ptr,
         out_ptr,
@@ -110,6 +111,7 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
     [num_values,
      in_ptr  = thrust::raw_pointer_cast(in.data()),
      out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
       nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                  out_ptr,
                                                                  num_values);
@@ -133,7 +135,7 @@ void copy_type_conversion_sweep(nvbench::state &state,
                                 nvbench::type_list<InputType, OutputType>)
 {
   // Optional: Skip narrowing conversions.
-  if (sizeof(InputType) > sizeof(OutputType))
+  if constexpr(sizeof(InputType) > sizeof(OutputType))
   {
     state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
     return;
@@ -156,6 +158,7 @@ void copy_type_conversion_sweep(nvbench::state &state,
     [num_values,
      in_ptr  = thrust::raw_pointer_cast(in.data()),
      out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
       nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                  out_ptr,
                                                                  num_values);
diff --git a/examples/custom_criterion.cu b/examples/custom_criterion.cu
new file mode 100644
index 00000000..46612355
--- /dev/null
+++ b/examples/custom_criterion.cu
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+// Inherit from the stopping_criterion_base class; this criterion stops at a fixed sample count:
+class fixed_criterion final : public nvbench::stopping_criterion_base
+{
+  nvbench::int64_t m_num_samples{}; // measurements added since the last do_initialize()
+
+public:
+  fixed_criterion()
+      : nvbench::stopping_criterion_base{"fixed", {{"max-samples", nvbench::int64_t{42}}}}
+  {}
+
+protected:
+  // Set up (reset) the criterion's state in the `do_initialize()` method:
+  virtual void do_initialize() override
+  {
+    m_num_samples = 0;
+  }
+
+  // Process new measurements in the `do_add_measurement()` method (only the count is used here):
+  virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
+  {
+    m_num_samples++;
+  }
+
+  // Check if the stopping criterion is met in the `do_is_finished()` method:
+  virtual bool do_is_finished() override
+  {
+    return m_num_samples >= m_params.get_int64("max-samples");
+  }
+
+};
+
+// Register the criterion with NVBench:
+NVBENCH_REGISTER_CRITERION(fixed_criterion);
+
+void throughput_bench(nvbench::state &state)
+{
+  // Allocate input and output buffers (64 MiB each):
+  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
+  thrust::device_vector<nvbench::int32_t> input(num_values);
+  thrust::device_vector<nvbench::int32_t> output(num_values);
+
+  // Provide throughput information:
+  state.add_element_count(num_values, "NumElements");
+  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
+  state.add_global_memory_writes<nvbench::int32_t>(num_values);
+
+  state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
+    (void) num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
+      thrust::raw_pointer_cast(input.data()),
+      thrust::raw_pointer_cast(output.data()),
+      num_values);
+  });
+}
+NVBENCH_BENCH(throughput_bench).set_stopping_criterion("fixed");
diff --git a/examples/enums.cu b/examples/enums.cu
index c14c2d48..fa149acd 100644
--- a/examples/enums.cu
+++ b/examples/enums.cu
@@ -91,7 +91,7 @@ NVBENCH_BENCH(runtime_enum_sweep_string)
 // ```
 void runtime_enum_sweep_int64(nvbench::state &state)
 {
-  const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
+  [[maybe_unused]] const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
 
   // Do stuff with enum_value.
   // Create inputs, etc, configure runtime kernel parameters, etc.
diff --git a/examples/exec_tag_sync.cu b/examples/exec_tag_sync.cu
index 0ef4ee78..13669314 100644
--- a/examples/exec_tag_sync.cu
+++ b/examples/exec_tag_sync.cu
@@ -27,6 +27,9 @@
 // Used to initialize input data:
 #include <thrust/sequence.h>
 
+// Used to run the benchmark on a CUDA stream
+#include <thrust/execution_policy.h>
+
 // `sequence_bench` measures the execution time of `thrust::sequence`. Since
 // algorithms in `thrust::` implicitly sync the CUDA device, the
 // `nvbench::exec_tag::sync` must be passed to `state.exec(...)`.
diff --git a/examples/exec_tag_timer.cu b/examples/exec_tag_timer.cu
index 6aab8582..e283f43b 100644
--- a/examples/exec_tag_timer.cu
+++ b/examples/exec_tag_timer.cu
@@ -23,6 +23,7 @@
 
 // Thrust simplifies memory management, etc:
 #include <thrust/copy.h>
+#include <thrust/execution_policy.h>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
@@ -53,6 +54,8 @@ void mod2_inplace(nvbench::state &state)
   state.exec(nvbench::exec_tag::timer,
              // Lambda now takes a `timer` argument:
              [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
+               (void) num_values; // clang thinks this is unused...
+
                // Reset working data:
                thrust::copy(thrust::device.on(launch.get_stream()),
                             input.cbegin(),
diff --git a/examples/stream.cu b/examples/stream.cu
index 9507558d..20254e5e 100644
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -52,6 +52,7 @@ void stream_bench(nvbench::state &state)
   state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
 
   state.exec([&input, &output, num_values](nvbench::launch &) {
+    (void) num_values; // clang thinks this is unused...
     copy(thrust::raw_pointer_cast(input.data()),
          thrust::raw_pointer_cast(output.data()),
          num_values);
diff --git a/examples/throughput.cu b/examples/throughput.cu
index 5621ebd7..24df6ee8 100644
--- a/examples/throughput.cu
+++ b/examples/throughput.cu
@@ -51,6 +51,7 @@ void throughput_bench(nvbench::state &state)
   state.add_global_memory_writes<nvbench::int32_t>(num_values);
 
   state.exec([&input, &output, num_values](nvbench::launch &launch) {
+    (void) num_values; // clang thinks this is unused...
     nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
       thrust::raw_pointer_cast(input.data()),
       thrust::raw_pointer_cast(output.data()),
diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt
index f86bd415..182843c1 100644
--- a/nvbench/CMakeLists.txt
+++ b/nvbench/CMakeLists.txt
@@ -4,6 +4,7 @@ set(srcs
   benchmark_base.cxx
   benchmark_manager.cxx
   blocking_kernel.cu
+  criterion_manager.cxx
   csv_printer.cu
   cuda_call.cu
   device_info.cu
@@ -17,23 +18,24 @@ set(srcs
   printer_multiplex.cxx
   runner.cxx
   state.cxx
+  stopping_criterion.cxx
   string_axis.cxx
   type_axis.cxx
   type_strings.cxx
 
+  detail/entropy_criterion.cxx
   detail/measure_cold.cu
   detail/measure_hot.cu
   detail/state_generator.cxx
+  detail/stdrel_criterion.cxx
+
+  internal/nvml.cxx
 )
 
 if (NVBench_ENABLE_CUPTI)
   list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx)
 endif()
 
-if (NVBench_ENABLE_NVML)
-  list(APPEND srcs internal/nvml.cxx)
-endif()
-
 # CUDA 11.0 can't compile json_printer without crashing
 # So for that version fall back to C++ with degraded
 # output ( no PTX version info )
@@ -65,7 +67,7 @@ nvbench_write_config_header(config.cuh.in
 )
 
 # nvbench (nvbench::nvbench)
-add_library(nvbench SHARED ${srcs})
+add_library(nvbench ${srcs})
 nvbench_config_target(nvbench)
 target_include_directories(nvbench PUBLIC
   "$<BUILD_INTERFACE:${NVBench_SOURCE_DIR}>"
@@ -78,8 +80,29 @@ target_link_libraries(nvbench
   PRIVATE
     fmt::fmt
     nvbench_json
-    nvbench_git_revision
 )
+
+# ##################################################################################################
+# * conda environment -----------------------------------------------------------------------------
+rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
+if(TARGET conda_env)
+  # When we are inside a conda env, link nvbench against the `conda_env` target
+  # created by `rapids_cmake_support_conda_env` above.
+  #
+  # `$<BUILD_INTERFACE:...>` restricts this to the build tree, so `conda_env` does
+  # not become part of nvbench's installed/exported link interface.
+  target_link_libraries(nvbench PRIVATE $<BUILD_INTERFACE:conda_env>)
+
+  # When we are inside a conda env the linker will be set to
+  # `ld.bfd` which will try to resolve all undefined symbols at link time.
+  #
+  # Since we could be using a shared library version of fmt we need
+  # it on the final link line of consumers
+  if(fmt_is_external)
+    target_link_libraries(nvbench PUBLIC fmt::fmt)
+  endif()
+endif()
+
 target_compile_features(nvbench PUBLIC cuda_std_17 PRIVATE cxx_std_17)
 add_dependencies(nvbench.all nvbench)
 
@@ -107,3 +130,18 @@ if (json_is_cu)
     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--diag_suppress=940>
   )
 endif()
+
+# The call to `rapids_cmake_write_git_revision_file` must be in the same
+# CMakeLists.txt as the consumer ( nvbench ) for CMake to get the dependency
+# graph correct.
+rapids_cmake_write_git_revision_file(
+  nvbench_git_revision
+  "${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
+)
+target_link_libraries(nvbench PRIVATE nvbench_git_revision)
+
+if(NOT BUILD_SHARED_LIBS)
+  # Need to ensure that for static builds we export the nvbench_git_revision
+  # target
+  nvbench_install_libraries(nvbench_git_revision)
+endif()
diff --git a/nvbench/axes_metadata.cuh b/nvbench/axes_metadata.cuh
index 353855a8..26631913 100644
--- a/nvbench/axes_metadata.cuh
+++ b/nvbench/axes_metadata.cuh
@@ -41,8 +41,8 @@ struct axes_metadata
   template <typename... TypeAxes>
   explicit axes_metadata(nvbench::type_list<TypeAxes...>);
 
-  axes_metadata()                 = default;
-  axes_metadata(axes_metadata &&) = default;
+  axes_metadata()                            = default;
+  axes_metadata(axes_metadata &&)            = default;
   axes_metadata &operator=(axes_metadata &&) = default;
 
   axes_metadata(const axes_metadata &);
@@ -58,20 +58,16 @@ struct axes_metadata
 
   void add_string_axis(std::string name, std::vector<std::string> data);
 
-  [[nodiscard]] const nvbench::int64_axis &
-  get_int64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::int64_axis &get_int64_axis(std::string_view name) const;
   [[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::float64_axis &
-  get_float64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::float64_axis &get_float64_axis(std::string_view name) const;
   [[nodiscard]] nvbench::float64_axis &get_float64_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::string_axis &
-  get_string_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::string_axis &get_string_axis(std::string_view name) const;
   [[nodiscard]] nvbench::string_axis &get_string_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::type_axis &
-  get_type_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::type_axis &get_type_axis(std::string_view name) const;
   [[nodiscard]] nvbench::type_axis &get_type_axis(std::string_view name);
 
   [[nodiscard]] const nvbench::type_axis &get_type_axis(std::size_t index) const;
@@ -83,10 +79,9 @@ struct axes_metadata
   [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name) const;
   [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::axis_base &
-  get_axis(std::string_view name, nvbench::axis_type type) const;
-  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name,
-                                             nvbench::axis_type type);
+  [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name,
+                                                   nvbench::axis_type type) const;
+  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name, nvbench::axis_type type);
 
   [[nodiscard]] static std::vector<std::string>
   generate_default_type_axis_names(std::size_t num_type_axes);
@@ -101,7 +96,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
 {
   using type_axes_list         = nvbench::type_list<TypeAxes...>;
   constexpr auto num_type_axes = nvbench::tl::size<type_axes_list>::value;
-  auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
+  auto names                   = axes_metadata::generate_default_type_axis_names(num_type_axes);
 
   auto names_iter = names.begin(); // contents will be moved from
   nvbench::tl::foreach<type_axes_list>(
@@ -114,8 +109,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
       // The word "type" appears 6 times in the next line.
       // Every. Single. Token.
       typedef typename decltype(wrapped_type)::type type_list;
-      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++),
-                                                       type_axis_index);
+      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++), type_axis_index);
       axis->template set_inputs<type_list>();
       axes.push_back(std::move(axis));
     });
diff --git a/nvbench/axes_metadata.cxx b/nvbench/axes_metadata.cxx
index 044bc91f..ef51a964 100644
--- a/nvbench/axes_metadata.cxx
+++ b/nvbench/axes_metadata.cxx
@@ -64,9 +64,7 @@ try
     auto &axis = *m_axes[i];
     if (axis.get_type() != nvbench::axis_type::type)
     {
-      NVBENCH_THROW(std::runtime_error,
-                    "Number of names exceeds number of type axes ({})",
-                    i);
+      NVBENCH_THROW(std::runtime_error, "Number of names exceeds number of type axes ({})", i);
     }
 
     axis.set_name(std::move(names[i]));
@@ -81,8 +79,7 @@ catch (std::exception &e)
                 names);
 }
 
-void axes_metadata::add_float64_axis(std::string name,
-                                     std::vector<nvbench::float64_t> data)
+void axes_metadata::add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
 {
   auto axis = std::make_unique<nvbench::float64_axis>(std::move(name));
   axis->set_inputs(std::move(data));
@@ -98,8 +95,7 @@ void axes_metadata::add_int64_axis(std::string name,
   m_axes.push_back(std::move(axis));
 }
 
-void axes_metadata::add_string_axis(std::string name,
-                                    std::vector<std::string> data)
+void axes_metadata::add_string_axis(std::string name, std::vector<std::string> data)
 {
   auto axis = std::make_unique<nvbench::string_axis>(std::move(name));
   axis->set_inputs(std::move(data));
@@ -188,10 +184,9 @@ nvbench::type_axis &axes_metadata::get_type_axis(std::size_t index)
 
 const axis_base &axes_metadata::get_axis(std::string_view name) const
 {
-  auto iter =
-    std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });
 
   if (iter == m_axes.cend())
   {
@@ -203,10 +198,9 @@ const axis_base &axes_metadata::get_axis(std::string_view name) const
 
 axis_base &axes_metadata::get_axis(std::string_view name)
 {
-  auto iter =
-    std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });
 
   if (iter == m_axes.end())
   {
@@ -216,8 +210,7 @@ axis_base &axes_metadata::get_axis(std::string_view name)
   return **iter;
 }
 
-const axis_base &axes_metadata::get_axis(std::string_view name,
-                                         nvbench::axis_type type) const
+const axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type) const
 {
   const auto &axis = this->get_axis(name);
   if (axis.get_type() != type)
@@ -231,8 +224,7 @@ const axis_base &axes_metadata::get_axis(std::string_view name,
   return axis;
 }
 
-axis_base &axes_metadata::get_axis(std::string_view name,
-                                   nvbench::axis_type type)
+axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type)
 {
   auto &axis = this->get_axis(name);
   if (axis.get_type() != type)
@@ -246,8 +238,7 @@ axis_base &axes_metadata::get_axis(std::string_view name,
   return axis;
 }
 
-std::vector<std::string>
-axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
+std::vector<std::string> axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
 {
   switch (num_type_axes)
   {
diff --git a/nvbench/axis_base.cuh b/nvbench/axis_base.cuh
index 712172f4..0760f702 100644
--- a/nvbench/axis_base.cuh
+++ b/nvbench/axis_base.cuh
@@ -21,6 +21,7 @@
 #include <memory>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <utility>
 
 namespace nvbench
@@ -47,10 +48,7 @@ struct axis_base
 
   [[nodiscard]] axis_type get_type() const { return m_type; }
 
-  [[nodiscard]] std::string_view get_type_as_string() const
-  {
-    return axis_type_to_string(m_type);
-  }
+  [[nodiscard]] std::string_view get_type_as_string() const { return axis_type_to_string(m_type); }
 
   [[nodiscard]] std::string_view get_flags_as_string() const
   {
@@ -93,16 +91,12 @@ inline std::string_view axis_type_to_string(axis_type type)
   {
     case axis_type::type:
       return "type";
-      break;
     case axis_type::int64:
       return "int64";
-      break;
     case axis_type::float64:
       return "float64";
-      break;
     case axis_type::string:
       return "string";
-      break;
   }
   throw std::runtime_error{"nvbench::axis_type_to_string Invalid axis_type."};
 }
diff --git a/nvbench/axis_base.cxx b/nvbench/axis_base.cxx
index 6d0bd4df..166f1bae 100644
--- a/nvbench/axis_base.cxx
+++ b/nvbench/axis_base.cxx
@@ -23,9 +23,6 @@ namespace nvbench
 
 axis_base::~axis_base() = default;
 
-std::unique_ptr<axis_base> axis_base::clone() const
-{
-  return this->do_clone();
-}
+std::unique_ptr<axis_base> axis_base::clone() const { return this->do_clone(); }
 
 } // namespace nvbench
diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh
index 5e050d1c..a226070b 100644
--- a/nvbench/benchmark.cuh
+++ b/nvbench/benchmark.cuh
@@ -57,18 +57,14 @@ struct benchmark final : public benchmark_base
   using type_axes        = TypeAxes;
   using type_configs     = nvbench::tl::cartesian_product<type_axes>;
 
-  static constexpr std::size_t num_type_configs =
-    nvbench::tl::size<type_configs>{};
+  static constexpr std::size_t num_type_configs = nvbench::tl::size<type_configs>{};
 
   benchmark()
       : benchmark_base(type_axes{})
   {}
 
 private:
-  std::unique_ptr<benchmark_base> do_clone() const final
-  {
-    return std::make_unique<benchmark>();
-  }
+  std::unique_ptr<benchmark_base> do_clone() const final { return std::make_unique<benchmark>(); }
 
   void do_set_type_axes_names(std::vector<std::string> names) final
   {
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 3a16408c..170b942a 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -20,8 +20,8 @@
 
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/device_info.cuh>
-#include <nvbench/device_manager.cuh>
 #include <nvbench/state.cuh>
+#include <nvbench/stopping_criterion.cuh>
 
 #include <functional> // reference_wrapper, ref
 #include <memory>
@@ -52,7 +52,6 @@ struct benchmark_base
   template <typename TypeAxes>
   explicit benchmark_base(TypeAxes type_axes)
       : m_axes(type_axes)
-      , m_devices(nvbench::device_manager::get().get_devices())
   {}
 
   virtual ~benchmark_base();
@@ -80,32 +79,28 @@ struct benchmark_base
     return *this;
   }
 
-  benchmark_base &add_float64_axis(std::string name,
-                                   std::vector<nvbench::float64_t> data)
+  benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
   {
     m_axes.add_float64_axis(std::move(name), std::move(data));
     return *this;
   }
 
-  benchmark_base &add_int64_axis(
-    std::string name,
-    std::vector<nvbench::int64_t> data,
-    nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
+  benchmark_base &add_int64_axis(std::string name,
+                                 std::vector<nvbench::int64_t> data,
+                                 nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
   {
     m_axes.add_int64_axis(std::move(name), std::move(data), flags);
     return *this;
   }
 
-  benchmark_base &add_int64_power_of_two_axis(std::string name,
-                                              std::vector<nvbench::int64_t> data)
+  benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
   {
     return this->add_int64_axis(std::move(name),
                                 std::move(data),
                                 nvbench::int64_axis_flags::power_of_two);
   }
 
-  benchmark_base &add_string_axis(std::string name,
-                                  std::vector<std::string> data)
+  benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
   {
     m_axes.add_string_axis(std::move(name), std::move(data));
     return *this;
@@ -133,48 +128,30 @@ struct benchmark_base
     return *this;
   }
 
-  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }
 
   [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
 
-  [[nodiscard]] const nvbench::axes_metadata &get_axes() const
-  {
-    return m_axes;
-  }
+  [[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }
 
   // Computes the number of configs in the benchmark.
   // Unlike get_states().size(), this method may be used prior to calling run().
   [[nodiscard]] std::size_t get_config_count() const;
 
   // Is empty until run() is called.
-  [[nodiscard]] const std::vector<nvbench::state> &get_states() const
-  {
-    return m_states;
-  }
+  [[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
   [[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }
 
   void run() { this->do_run(); }
 
-  void set_printer(nvbench::printer_base &printer)
-  {
-    m_printer = std::ref(printer);
-  }
+  void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }
 
   void clear_printer() { m_printer = std::nullopt; }
 
-  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const
-  {
-    return m_printer;
-  }
+  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }
 
   /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
   benchmark_base &set_min_samples(nvbench::int64_t min_samples)
   {
     m_min_samples = min_samples;
@@ -193,7 +170,7 @@ struct benchmark_base
   }
   /// @}
 
-  /// If true, the benchmark does not use the blocking_kernel. This is intended 
+  /// If true, the benchmark does not use the blocking_kernel. This is intended
   /// for use with external profiling tools. @{
   [[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
   benchmark_base &set_disable_blocking_kernel(bool v)
@@ -203,22 +180,30 @@ struct benchmark_base
   }
   /// @}
 
-  /// Accumulate at least this many seconds of timing data per measurement. @{
-  [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
+  /// Accumulate at least this many seconds of timing data per measurement.
+  /// Only applies to the `stdrel` stopping criterion. @{
+  [[nodiscard]] nvbench::float64_t get_min_time() const
+  {
+    return m_criterion_params.get_float64("min-time");
+  }
   benchmark_base &set_min_time(nvbench::float64_t min_time)
   {
-    m_min_time = min_time;
+    m_criterion_params.set_float64("min-time", min_time);
     return *this;
   }
   /// @}
 
   /// Specify the maximum amount of noise if a measurement supports noise.
   /// Noise is the relative standard deviation:
-  /// `noise = stdev / mean_time`. @{
-  [[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
+  /// `noise = stdev / mean_time`.
+  /// Only applies to the `stdrel` stopping criterion. @{
+  [[nodiscard]] nvbench::float64_t get_max_noise() const
+  {
+    return m_criterion_params.get_float64("max-noise");
+  }
   benchmark_base &set_max_noise(nvbench::float64_t max_noise)
   {
-    m_max_noise = max_noise;
+    m_criterion_params.set_float64("max-noise", max_noise);
     return *this;
   }
   /// @}
@@ -252,6 +237,19 @@ struct benchmark_base
   }
   /// @}
 
+  [[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
+  [[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
+
+  /// Control the stopping criterion for the measurement loop.
+  /// @{
+  [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
+  benchmark_base &set_stopping_criterion(std::string criterion)
+  {
+    m_stopping_criterion = std::move(criterion);
+    return *this;
+  }
+  /// @}
+
 protected:
   friend struct nvbench::runner_base;
 
@@ -269,12 +267,13 @@ protected:
   bool m_disable_blocking_kernel{false};
 
   nvbench::int64_t m_min_samples{10};
-  nvbench::float64_t m_min_time{0.5};
-  nvbench::float64_t m_max_noise{0.005}; // 0.5% relative standard deviation
 
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
+  nvbench::criterion_params m_criterion_params;
+  std::string m_stopping_criterion{"stdrel"};
+
 private:
   // route these through virtuals so the templated subclass can inject type info
   virtual std::unique_ptr<benchmark_base> do_clone() const            = 0;
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index 2d08fdbd..6e89fd3d 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -34,13 +34,14 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_axes    = m_axes;
   result->m_devices = m_devices;
 
-  result->m_min_samples = m_min_samples;
-  result->m_min_time    = m_min_time;
-  result->m_max_noise   = m_max_noise;
+  result->m_min_samples      = m_min_samples;
+  result->m_criterion_params = m_criterion_params;
 
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
 
+  result->m_stopping_criterion = m_stopping_criterion;
+
   return result;
 }
 
@@ -68,8 +69,7 @@ std::size_t benchmark_base::get_config_count() const
     std::size_t{1},
     std::multiplies<>{},
     [](const auto &axis_ptr) {
-      if (const auto *type_axis_ptr =
-            dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
+      if (const auto *type_axis_ptr = dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
           type_axis_ptr != nullptr)
       {
         return type_axis_ptr->get_active_count();
diff --git a/nvbench/benchmark_manager.cuh b/nvbench/benchmark_manager.cuh
index 39b1717c..51fab18e 100644
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -31,14 +31,22 @@ namespace nvbench
  */
 struct benchmark_manager
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
 
   /**
    * @return The singleton benchmark_manager instance.
    */
   [[nodiscard]] static benchmark_manager &get();
 
+  /**
+   * Set up any default values for the benchmarks. Invoked from `main`.
+   *
+   * Specifically, any CUDA calls (e.g. cudaGetDeviceProperties) needed to initialize the
+   * benchmarks should be done here to avoid creating a CUDA context before we configure the CUDA
+   * environment in `main`.
+   */
+  void initialize();
+
   /**
    * Register a new benchmark.
    */
@@ -53,25 +61,21 @@ struct benchmark_manager
    * Get a non-mutable reference to benchmark with the specified name/index.
    * @{
    */
-  [[nodiscard]] const benchmark_base &
-  get_benchmark(const std::string &name) const;
+  [[nodiscard]] const benchmark_base &get_benchmark(const std::string &name) const;
   [[nodiscard]] const benchmark_base &get_benchmark(std::size_t idx) const
   {
     return *m_benchmarks.at(idx);
   }
   /**@}*/
 
-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
 
 private:
-  benchmark_manager()                          = default;
-  benchmark_manager(const benchmark_manager &) = delete;
-  benchmark_manager(benchmark_manager &&)      = delete;
+  benchmark_manager()                                     = default;
+  benchmark_manager(const benchmark_manager &)            = delete;
+  benchmark_manager(benchmark_manager &&)                 = delete;
   benchmark_manager &operator=(const benchmark_manager &) = delete;
-  benchmark_manager &operator=(benchmark_manager &&) = delete;
+  benchmark_manager &operator=(benchmark_manager &&)      = delete;
 
   benchmark_vector m_benchmarks;
 };
diff --git a/nvbench/benchmark_manager.cxx b/nvbench/benchmark_manager.cxx
index 2a0ca603..5df702db 100644
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -18,6 +18,7 @@
 
 #include <nvbench/benchmark_manager.cuh>
 
+#include <nvbench/device_manager.cuh>
 #include <nvbench/detail/throw.cuh>
 
 #include <fmt/format.h>
@@ -34,6 +35,15 @@ benchmark_manager &benchmark_manager::get()
   return the_manager;
 }
 
+void benchmark_manager::initialize()
+{
+  const auto& mgr = device_manager::get();
+  for (auto& bench : m_benchmarks)
+  {
+    bench->set_devices(mgr.get_devices());
+  }
+}
+
 benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
 {
   m_benchmarks.push_back(std::move(bench));
@@ -43,21 +53,18 @@ benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
 benchmark_manager::benchmark_vector benchmark_manager::clone_benchmarks() const
 {
   benchmark_vector result(m_benchmarks.size());
-  std::transform(m_benchmarks.cbegin(),
-                 m_benchmarks.cend(),
-                 result.begin(),
-                 [](const auto &bench) { return bench->clone(); });
+  std::transform(m_benchmarks.cbegin(), m_benchmarks.cend(), result.begin(), [](const auto &bench) {
+    return bench->clone();
+  });
   return result;
 }
 
-const benchmark_base &
-benchmark_manager::get_benchmark(const std::string &name) const
+const benchmark_base &benchmark_manager::get_benchmark(const std::string &name) const
 {
-  auto iter = std::find_if(m_benchmarks.cbegin(),
-                           m_benchmarks.cend(),
-                           [&name](const auto &bench_ptr) {
-                             return bench_ptr->get_name() == name;
-                           });
+  auto iter =
+    std::find_if(m_benchmarks.cbegin(), m_benchmarks.cend(), [&name](const auto &bench_ptr) {
+      return bench_ptr->get_name() == name;
+    });
   if (iter == m_benchmarks.cend())
   {
     NVBENCH_THROW(std::out_of_range, "No benchmark named '{}'.", name);
diff --git a/nvbench/blocking_kernel.cu b/nvbench/blocking_kernel.cu
index 1ee5855c..f3478331 100644
--- a/nvbench/blocking_kernel.cu
+++ b/nvbench/blocking_kernel.cu
@@ -42,8 +42,8 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
                              nvbench::float64_t timeout)
 {
   const auto start_point = cuda::std::chrono::high_resolution_clock::now();
-  const auto timeout_ns  = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(timeout * 1e9));
+  const auto timeout_ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(timeout * 1e9));
   const auto timeout_point = start_point + timeout_ns;
 
   const bool use_timeout = timeout >= 0.;
@@ -57,41 +57,40 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
   {
     *timeout_flag = 1;
     __threadfence_system(); // Ensure timeout flag visibility on host.
-    printf(
-      "\n"
-      "######################################################################\n"
-      "##################### Possible Deadlock Detected #####################\n"
-      "######################################################################\n"
-      "\n"
-      "Forcing unblock: The current measurement appears to have deadlocked\n"
-      "and the results cannot be trusted.\n"
-      "\n"
-      "This happens when the KernelLauncher synchronizes the CUDA device.\n"
-      "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
-      "\n"
-      "    state.exec(<KernelLauncher>); // Deadlock\n"
-      "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
-      "\n"
-      "This tells NVBench about the sync so it can run the benchmark safely.\n"
-      "\n"
-      "If the KernelLauncher does not synchronize but has a very long \n"
-      "execution time, this may be a false positive. If so, disable this\n"
-      "check with:\n"
-      "\n"
-      "    state.set_blocking_kernel_timeout(-1);\n"
-      "\n"
-      "The current timeout is set to %0.5g seconds.\n"
-      "\n"
-      "For more information, see the 'Benchmarks that sync' section of the\n"
-      "NVBench documentation.\n"
-      "\n"
-      "If this happens while profiling with an external tool,\n"
-      "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
-      "(to also only run the benchmark once) to the executable.\n"
-      "\n"
-      "For more information, see the 'Benchmark Properties' section of the\n"
-      "NVBench documentation.\n\n",
-      timeout);
+    printf("\n"
+           "######################################################################\n"
+           "##################### Possible Deadlock Detected #####################\n"
+           "######################################################################\n"
+           "\n"
+           "Forcing unblock: The current measurement appears to have deadlocked\n"
+           "and the results cannot be trusted.\n"
+           "\n"
+           "This happens when the KernelLauncher synchronizes the CUDA device.\n"
+           "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
+           "\n"
+           "    state.exec(<KernelLauncher>); // Deadlock\n"
+           "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
+           "\n"
+           "This tells NVBench about the sync so it can run the benchmark safely.\n"
+           "\n"
+           "If the KernelLauncher does not synchronize but has a very long \n"
+           "execution time, this may be a false positive. If so, disable this\n"
+           "check with:\n"
+           "\n"
+           "    state.set_blocking_kernel_timeout(-1);\n"
+           "\n"
+           "The current timeout is set to %0.5g seconds.\n"
+           "\n"
+           "For more information, see the 'Benchmarks that sync' section of the\n"
+           "NVBench documentation.\n"
+           "\n"
+           "If this happens while profiling with an external tool,\n"
+           "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
+           "(to also only run the benchmark once) to the executable.\n"
+           "\n"
+           "For more information, see the 'Benchmark Properties' section of the\n"
+           "NVBench documentation.\n\n",
+           timeout);
   }
 }
 
@@ -102,15 +101,11 @@ namespace nvbench
 
 blocking_kernel::blocking_kernel()
 {
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag,
-                                     sizeof(m_host_flag),
-                                     cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag, sizeof(m_host_flag), cudaHostRegisterMapped));
   NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_flag, &m_host_flag, 0));
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timeout_flag,
-                                     sizeof(m_host_timeout_flag),
-                                     cudaHostRegisterMapped));
   NVBENCH_CUDA_CALL(
-    cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
+    cudaHostRegister(&m_host_timeout_flag, sizeof(m_host_timeout_flag), cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
 }
 
 blocking_kernel::~blocking_kernel()
@@ -119,14 +114,11 @@ blocking_kernel::~blocking_kernel()
   NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timeout_flag));
 }
 
-void blocking_kernel::block(const nvbench::cuda_stream &stream,
-                            nvbench::float64_t timeout)
+void blocking_kernel::block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout)
 {
   m_host_flag         = 0;
   m_host_timeout_flag = 0;
-  block_stream<<<1, 1, 0, stream>>>(m_device_flag,
-                                    m_device_timeout_flag,
-                                    timeout);
+  block_stream<<<1, 1, 0, stream>>>(m_device_flag, m_device_timeout_flag, timeout);
 }
 
 void blocking_kernel::timeout_detected()
diff --git a/nvbench/blocking_kernel.cuh b/nvbench/blocking_kernel.cuh
index ecbfed8f..13f737ef 100644
--- a/nvbench/blocking_kernel.cuh
+++ b/nvbench/blocking_kernel.cuh
@@ -97,10 +97,10 @@ struct blocking_kernel
   }
 
   // move-only
-  blocking_kernel(const blocking_kernel &) = delete;
-  blocking_kernel(blocking_kernel &&)      = default;
+  blocking_kernel(const blocking_kernel &)            = delete;
+  blocking_kernel(blocking_kernel &&)                 = default;
   blocking_kernel &operator=(const blocking_kernel &) = delete;
-  blocking_kernel &operator=(blocking_kernel &&) = default;
+  blocking_kernel &operator=(blocking_kernel &&)      = default;
 
 private:
   nvbench::int32_t m_host_flag{};
diff --git a/nvbench/callable.cuh b/nvbench/callable.cuh
index ce7fff02..2cd1f15f 100644
--- a/nvbench/callable.cuh
+++ b/nvbench/callable.cuh
@@ -30,35 +30,29 @@ struct state;
 // Define a simple callable wrapper around a function. This allows the function
 // to be used as a class template parameter. Intended for use with kernel
 // generators and `NVBENCH_BENCH` macros.
-#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                               \
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                                                   \
   NVBENCH_DEFINE_CALLABLE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
 
-#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                       \
-  struct callable_name                                                         \
-  {                                                                            \
-    void operator()(nvbench::state &state, nvbench::type_list<>)               \
-    {                                                                          \
-      function(state);                                                         \
-    }                                                                          \
+#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                                           \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    void operator()(nvbench::state &state, nvbench::type_list<>) { function(state); }              \
   }
 
-#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                      \
-  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function,                                   \
-                                   NVBENCH_UNIQUE_IDENTIFIER(function))
-
-#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)              \
-  struct callable_name                                                         \
-  {                                                                            \
-    template <typename... Ts>                                                  \
-    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)          \
-    {                                                                          \
-      function(state, nvbench::type_list<Ts...>{});                            \
-    }                                                                          \
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                                          \
+  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
+
+#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)                                  \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    template <typename... Ts>                                                                      \
+    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)                              \
+    {                                                                                              \
+      function(state, nvbench::type_list<Ts...>{});                                                \
+    }                                                                                              \
   }
 
-#define NVBENCH_UNIQUE_IDENTIFIER(prefix)                                      \
-  NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                     \
+#define NVBENCH_UNIQUE_IDENTIFIER(prefix) NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                                         \
   NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)                     \
-  prefix##_line_##unique_id
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) prefix##_line_##unique_id
diff --git a/nvbench/cpu_timer.cuh b/nvbench/cpu_timer.cuh
index 09d3c54e..d4ba6553 100644
--- a/nvbench/cpu_timer.cuh
+++ b/nvbench/cpu_timer.cuh
@@ -30,27 +30,20 @@ struct cpu_timer
   __forceinline__ cpu_timer() = default;
 
   // move-only
-  cpu_timer(const cpu_timer &) = delete;
-  cpu_timer(cpu_timer &&)      = default;
+  cpu_timer(const cpu_timer &)            = delete;
+  cpu_timer(cpu_timer &&)                 = default;
   cpu_timer &operator=(const cpu_timer &) = delete;
-  cpu_timer &operator=(cpu_timer &&) = default;
+  cpu_timer &operator=(cpu_timer &&)      = default;
 
-  __forceinline__ void start()
-  {
-    m_start = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void start() { m_start = std::chrono::high_resolution_clock::now(); }
 
-  __forceinline__ void stop()
-  {
-    m_stop = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void stop() { m_stop = std::chrono::high_resolution_clock::now(); }
 
   // In seconds:
   [[nodiscard]] __forceinline__ nvbench::float64_t get_duration()
   {
     const auto duration = m_stop - m_start;
-    const auto ns =
-      std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
+    const auto ns       = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
     return static_cast<nvbench::float64_t>(ns) * (1e-9);
   }
 
diff --git a/nvbench/create.cuh b/nvbench/create.cuh
index 7aed1b7e..902d6c38 100644
--- a/nvbench/create.cuh
+++ b/nvbench/create.cuh
@@ -27,19 +27,17 @@
 
 #define NVBENCH_TYPE_AXES(...) nvbench::type_list<__VA_ARGS__>
 
-#define NVBENCH_BENCH(KernelGenerator)                                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                             \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())  \
+#define NVBENCH_BENCH(KernelGenerator)                                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                                                 \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())     \
       .set_name(#KernelGenerator)
 
-#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                    \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator),      \
-                              TypeAxes>>())                                    \
+#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                                        \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<                                                                       \
+           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), TypeAxes>>())            \
       .set_name(#KernelGenerator)
diff --git a/nvbench/criterion_manager.cuh b/nvbench/criterion_manager.cuh
new file mode 100644
index 00000000..6c60993c
--- /dev/null
+++ b/nvbench/criterion_manager.cuh
@@ -0,0 +1,65 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/detail/entropy_criterion.cuh>
+#include <nvbench/detail/stdrel_criterion.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
+
+#include <memory>
+
+#include <unordered_map>
+
+namespace nvbench
+{
+
+class criterion_manager
+{
+  std::unordered_map<std::string, std::unique_ptr<nvbench::stopping_criterion_base>> m_map;
+
+  criterion_manager();
+
+public:
+  /**
+   * @return The singleton criterion_manager instance.
+   */
+  static criterion_manager& get();
+
+  /**
+   * Register a new stopping criterion.
+   */
+  nvbench::stopping_criterion_base& add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
+  nvbench::stopping_criterion_base& get_criterion(const std::string& name);
+  const nvbench::stopping_criterion_base& get_criterion(const std::string& name) const;
+
+  using params_description = std::vector<std::pair<std::string, nvbench::named_values::type>>;
+  params_description get_params_description() const;
+};
+
+/**
+ * Given a stopping criterion type `TYPE`, registers it in the criterion manager.
+ *
+ * See the `custom_criterion.cu` example for usage.
+ */
+#define NVBENCH_REGISTER_CRITERION(TYPE)                                                           \
+  static nvbench::stopping_criterion_base &NVBENCH_UNIQUE_IDENTIFIER(TYPE) =                       \
+    nvbench::criterion_manager::get().add(std::make_unique<TYPE>())
+
+} // namespace nvbench
diff --git a/nvbench/criterion_manager.cxx b/nvbench/criterion_manager.cxx
new file mode 100644
index 00000000..f4857e9e
--- /dev/null
+++ b/nvbench/criterion_manager.cxx
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/detail/throw.cuh>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace nvbench
+{
+
+criterion_manager::criterion_manager()
+{
+  m_map.emplace("stdrel", std::make_unique<nvbench::detail::stdrel_criterion>());
+  m_map.emplace("entropy", std::make_unique<nvbench::detail::entropy_criterion>());
+}
+
+criterion_manager &criterion_manager::get()
+{
+  static criterion_manager registry;
+  return registry;
+}
+
+stopping_criterion_base& criterion_manager::get_criterion(const std::string& name)
+{
+  auto iter = m_map.find(name);
+  if (iter == m_map.end())
+  {
+    NVBENCH_THROW(std::runtime_error, "No stopping criterion named \"{}\".", name);
+  }
+  return *iter->second.get();
+}
+
+const nvbench::stopping_criterion_base& criterion_manager::get_criterion(const std::string& name) const
+{
+  auto iter = m_map.find(name);
+  if (iter == m_map.end())
+  {
+    NVBENCH_THROW(std::runtime_error, "No stopping criterion named \"{}\".", name);
+  }
+  return *iter->second.get();
+}
+
+stopping_criterion_base &criterion_manager::add(std::unique_ptr<stopping_criterion_base> criterion)
+{
+  const std::string name = criterion->get_name();
+
+  auto [it, success] = m_map.emplace(name, std::move(criterion));
+
+  if (!success)
+  {
+    NVBENCH_THROW(std::runtime_error,
+                  "Stopping criterion \"{}\" is already registered.", name);
+  }
+
+  return *it->second.get();
+}
+
+nvbench::criterion_manager::params_description criterion_manager::get_params_description() const
+{
+  nvbench::criterion_manager::params_description desc;
+
+  for (auto &[criterion_name, criterion] : m_map)
+  {
+    nvbench::criterion_params params = criterion->get_params();
+
+    for (auto param : params.get_names())
+    {
+      nvbench::named_values::type type = params.get_type(param);
+      if (std::find_if(desc.begin(), desc.end(), [&](auto d) {
+            return d.first == param && d.second != type;
+          }) != desc.end())
+      {
+        NVBENCH_THROW(std::runtime_error,
+                      "Stopping criterion \"{}\" parameter \"{}\" is already used by another "
+                      "criterion with a different type.",
+                      criterion_name,
+                      param);
+      }
+      desc.emplace_back(param, type);
+    }
+  }
+
+  return desc;
+}
+
+} // namespace nvbench
diff --git a/nvbench/csv_printer.cu b/nvbench/csv_printer.cu
index 6acb535e..a0a906ef 100644
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -66,10 +66,8 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
     {
       std::optional<nvbench::device_info> device = cur_state.get_device();
 
-      std::string device_id   = device ? fmt::to_string(device->get_id())
-                                       : std::string{};
-      std::string device_name = device ? std::string{device->get_name()}
-                                       : std::string{};
+      std::string device_id   = device ? fmt::to_string(device->get_id()) : std::string{};
+      std::string device_name = device ? std::string{device->get_name()} : std::string{};
 
       table.add_cell(row, "_bench_name", "Benchmark", bench_name);
       table.add_cell(row, "_device_id", "Device", std::move(device_id));
@@ -88,15 +86,11 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
                          name + "_axis_pow2_pretty",
                          name + " (pow2)",
                          fmt::format("2^{}", exponent));
-          table.add_cell(row,
-                         name + "_axis_plain",
-                         fmt::format("{}", name),
-                         fmt::to_string(value));
+          table.add_cell(row, name + "_axis_plain", fmt::format("{}", name), fmt::to_string(value));
         }
         else
         {
-          std::string value = std::visit(format_visitor,
-                                         axis_values.get_value(name));
+          std::string value = std::visit(format_visitor, axis_values.get_value(name));
           table.add_cell(row, name + "_axis", name, std::move(value));
         }
       }
@@ -117,14 +111,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
           continue;
         }
         const std::string &tag    = summ.get_tag();
-        const std::string &header = summ.has_value("name")
-                                      ? summ.get_string("name")
-                                      : tag;
-
-        const std::string hint = summ.has_value("hint")
-                                   ? summ.get_string("hint")
-                                   : std::string{};
-        std::string value = std::visit(format_visitor, summ.get_value("value"));
+        const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
+
+        const std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
+        std::string value      = std::visit(format_visitor, summ.get_value("value"));
         if (hint == "duration")
         {
           table.add_cell(row, tag, header + " (sec)", std::move(value));
@@ -171,9 +161,9 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
     std::size_t remaining = table.m_columns.size();
     for (const auto &col : table.m_columns)
     {
-      fmt::format_to(buffer, "{}{}", col.header, (--remaining == 0) ? "" : ",");
+      fmt::format_to(std::back_inserter(buffer), "{}{}", col.header, (--remaining == 0) ? "" : ",");
     }
-    fmt::format_to(buffer, "\n");
+    fmt::format_to(std::back_inserter(buffer), "\n");
   }
 
   { // Rows
@@ -182,12 +172,9 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
       std::size_t remaining = table.m_columns.size();
       for (const auto &col : table.m_columns)
       {
-        fmt::format_to(buffer,
-                       "{}{}",
-                       col.rows[i],
-                       (--remaining == 0) ? "" : ",");
+        fmt::format_to(std::back_inserter(buffer), "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
       }
-      fmt::format_to(buffer, "\n");
+      fmt::format_to(std::back_inserter(buffer), "\n");
     }
   }
 
diff --git a/nvbench/cuda_call.cu b/nvbench/cuda_call.cu
index 6cb304be..662c7593 100644
--- a/nvbench/cuda_call.cu
+++ b/nvbench/cuda_call.cu
@@ -16,8 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/cuda_call.cuh>
 #include <nvbench/config.cuh>
+#include <nvbench/cuda_call.cuh>
 
 #include <fmt/format.h>
 
@@ -66,11 +66,7 @@ void throw_error(const std::string &filename,
                                        command));
 }
 #else
-void throw_error(const std::string &,
-                 std::size_t,
-                 const std::string &,
-                 CUresult)
-{}
+void throw_error(const std::string &, std::size_t, const std::string &, CUresult) {}
 #endif
 
 void exit_error(const std::string &filename,
diff --git a/nvbench/cuda_call.cuh b/nvbench/cuda_call.cuh
index f1d6c45b..5b2ae362 100644
--- a/nvbench/cuda_call.cuh
+++ b/nvbench/cuda_call.cuh
@@ -18,52 +18,45 @@
 
 #pragma once
 
-#include <cuda_runtime_api.h>
 #include <cuda.h>
+#include <cuda_runtime_api.h>
 
 #include <string>
 
 /// Throws a std::runtime_error if `call` doesn't return `cudaSuccess`.
-#define NVBENCH_CUDA_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+/// On failure, also resets the last CUDA error by calling cudaGetLastError().
+#define NVBENCH_CUDA_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      cudaGetLastError();                                                                          \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
   } while (false)
 
 /// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
-#define NVBENCH_DRIVER_API_CALL(call)                                          \
-  do                                                                           \
-  {                                                                            \
-    const CUresult nvbench_cuda_call_error = call;                             \
-    if (nvbench_cuda_call_error != CUDA_SUCCESS)                               \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+#define NVBENCH_DRIVER_API_CALL(call)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    const CUresult nvbench_cuda_call_error = call;                                                 \
+    if (nvbench_cuda_call_error != CUDA_SUCCESS)                                                   \
+    {                                                                                              \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
   } while (false)
 
 /// Terminates process with failure status if `call` doesn't return
 /// `cudaSuccess`.
-#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                       \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::exit_error(__FILE__,                                 \
-                                     __LINE__,                                 \
-                                     #call,                                    \
-                                     nvbench_cuda_call_error);                 \
-    }                                                                          \
+#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                                           \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      nvbench::cuda_call::exit_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);          \
+    }                                                                                              \
   } while (false)
 
 namespace nvbench::cuda_call
diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 6674c279..2c7536c1 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -66,10 +66,10 @@ struct cuda_stream
   ~cuda_stream() = default;
 
   // move-only
-  cuda_stream(const cuda_stream &) = delete;
+  cuda_stream(const cuda_stream &)            = delete;
   cuda_stream &operator=(const cuda_stream &) = delete;
   cuda_stream(cuda_stream &&)                 = default;
-  cuda_stream &operator=(cuda_stream &&) = default;
+  cuda_stream &operator=(cuda_stream &&)      = default;
 
   /**
    * @return The `cudaStream_t` managed by this `cuda_stream`.
diff --git a/nvbench/cuda_timer.cuh b/nvbench/cuda_timer.cuh
index 0e022ce1..e1c6e661 100644
--- a/nvbench/cuda_timer.cuh
+++ b/nvbench/cuda_timer.cuh
@@ -42,10 +42,10 @@ struct cuda_timer
   }
 
   // move-only
-  cuda_timer(const cuda_timer &) = delete;
-  cuda_timer(cuda_timer &&)      = default;
+  cuda_timer(const cuda_timer &)            = delete;
+  cuda_timer(cuda_timer &&)                 = default;
   cuda_timer &operator=(const cuda_timer &) = delete;
-  cuda_timer &operator=(cuda_timer &&) = default;
+  cuda_timer &operator=(cuda_timer &&)      = default;
 
   __forceinline__ void start(cudaStream_t stream)
   {
diff --git a/nvbench/cupti_profiler.cuh b/nvbench/cupti_profiler.cuh
index 6e0e255f..214706a7 100644
--- a/nvbench/cupti_profiler.cuh
+++ b/nvbench/cupti_profiler.cuh
@@ -21,14 +21,13 @@
 #include <nvbench/config.cuh>
 #include <nvbench/device_info.cuh>
 
+#include <optional>
 #include <string>
 #include <vector>
-#include <optional>
 
 namespace nvbench::detail
 {
 
-
 #ifdef NVBENCH_HAS_CUPTI
 /**
  * Pass required metrics in the constructor and organize your code as follows
@@ -62,7 +61,7 @@ namespace nvbench::detail
  */
 class cupti_profiler
 {
-  bool m_available {};
+  bool m_available{};
   std::string m_chip_name;
 
   // Counter data
@@ -87,11 +86,10 @@ public:
   cupti_profiler(cupti_profiler &&) noexcept;
   cupti_profiler &operator=(cupti_profiler &&) noexcept;
 
-  cupti_profiler(const cupti_profiler &) = delete;
+  cupti_profiler(const cupti_profiler &)            = delete;
   cupti_profiler &operator=(const cupti_profiler &) = delete;
 
-  cupti_profiler(nvbench::device_info device,
-                 std::vector<std::string> &&metric_names);
+  cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names);
   ~cupti_profiler();
 
   [[nodiscard]] bool is_initialized() const;
@@ -125,5 +123,4 @@ private:
 };
 #endif
 
-
 } // namespace nvbench::detail
diff --git a/nvbench/cupti_profiler.cxx b/nvbench/cupti_profiler.cxx
index 6dcd81d7..6233ef0e 100644
--- a/nvbench/cupti_profiler.cxx
+++ b/nvbench/cupti_profiler.cxx
@@ -31,6 +31,7 @@
 #include <fmt/format.h>
 
 #include <stdexcept>
+#include <type_traits>
 
 namespace nvbench::detail
 {
@@ -53,14 +54,13 @@ void nvpw_call(const NVPA_Status status)
 {
   if (status != NVPA_STATUS_SUCCESS)
   {
-    NVBENCH_THROW(std::runtime_error, "NVPW call returned error: {}", status);
+    NVBENCH_THROW(std::runtime_error, "NVPW call returned error: {}", static_cast<std::underlying_type_t<NVPA_Status>>(status));
   }
 }
 
 } // namespace
 
-cupti_profiler::cupti_profiler(nvbench::device_info device,
-                               std::vector<std::string> &&metric_names)
+cupti_profiler::cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names)
     : m_metric_names(metric_names)
     , m_device(device)
 {
@@ -154,12 +154,10 @@ class eval_request
   NVPW_MetricsEvaluator *evaluator_ptr;
 
 public:
-  eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
-               const std::string &metric_name)
+  eval_request(NVPW_MetricsEvaluator *evaluator_ptr, const std::string &metric_name)
       : evaluator_ptr(evaluator_ptr)
   {
-    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params =
-      {};
+    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {};
 
     params.structSize =
       NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE;
@@ -168,8 +166,7 @@ class eval_request
     params.pMetricEvalRequest          = &request;
     params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
 
-    nvpw_call(
-      NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
+    nvpw_call(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
   }
 
   [[nodiscard]] std::vector<const char *> get_raw_dependencies()
@@ -178,10 +175,9 @@ class eval_request
 
     NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params{};
 
-    params.structSize =
-      NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
-    params.pMetricsEvaluator           = evaluator_ptr;
-    params.pMetricEvalRequests         = &request;
+    params.structSize          = NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
+    params.pMetricsEvaluator   = evaluator_ptr;
+    params.pMetricEvalRequests = &request;
     params.numMetricEvalRequests       = 1;
     params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
     params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
@@ -211,26 +207,23 @@ class metric_evaluator
                    const std::uint8_t *counter_data_image         = nullptr,
                    const std::size_t counter_data_image_size      = 0)
   {
-    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
-      scratch_buffer_param{};
+    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params scratch_buffer_param{};
 
     scratch_buffer_param.structSize =
       NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE;
     scratch_buffer_param.pChipName                 = chip_name.c_str();
     scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
 
-    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
-      &scratch_buffer_param));
+    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(&scratch_buffer_param));
 
     scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
 
     NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params{};
 
-    evaluator_params.structSize =
-      NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
-    evaluator_params.scratchBufferSize         = scratch_buffer.size();
-    evaluator_params.pScratchBuffer            = scratch_buffer.data();
-    evaluator_params.pChipName                 = chip_name.c_str();
+    evaluator_params.structSize        = NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
+    evaluator_params.scratchBufferSize = scratch_buffer.size();
+    evaluator_params.pScratchBuffer    = scratch_buffer.data();
+    evaluator_params.pChipName         = chip_name.c_str();
     evaluator_params.pCounterAvailabilityImage = counter_availability_image;
     evaluator_params.pCounterDataImage         = counter_data_image;
     evaluator_params.counterDataImageSize      = counter_data_image_size;
@@ -247,7 +240,7 @@ class metric_evaluator
     {
       NVPW_MetricsEvaluator_Destroy_Params params{};
 
-      params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
       params.pMetricsEvaluator = evaluator_ptr;
 
       nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
@@ -259,10 +252,7 @@ class metric_evaluator
     return {evaluator_ptr, metric_name};
   }
 
-  [[nodiscard]] operator NVPW_MetricsEvaluator *() const
-  {
-    return evaluator_ptr;
-  }
+  [[nodiscard]] operator NVPW_MetricsEvaluator *() const { return evaluator_ptr; }
 };
 
 } // namespace
@@ -270,10 +260,10 @@ class metric_evaluator
 namespace
 {
 
-[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
-  const std::string &chip_name,
-  const std::vector<std::string> &metric_names,
-  const std::uint8_t *counter_availability_image = nullptr)
+[[nodiscard]] std::vector<NVPA_RawMetricRequest>
+get_raw_metric_requests(const std::string &chip_name,
+                        const std::vector<std::string> &metric_names,
+                        const std::uint8_t *counter_availability_image = nullptr)
 {
   metric_evaluator evaluator(chip_name, counter_availability_image);
 
@@ -282,8 +272,7 @@ namespace
 
   for (auto &metric_name : metric_names)
   {
-    for (auto &raw_dependency :
-         evaluator.create_request(metric_name).get_raw_dependencies())
+    for (auto &raw_dependency : evaluator.create_request(metric_name).get_raw_dependencies())
     {
       raw_metric_names.push_back(raw_dependency);
     }
@@ -295,10 +284,10 @@ namespace
   for (auto &raw_name : raw_metric_names)
   {
     NVPA_RawMetricRequest metricRequest{};
-    metricRequest.structSize            = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
-    metricRequest.pMetricName           = raw_name;
-    metricRequest.isolated              = true;
-    metricRequest.keepInstances         = true;
+    metricRequest.structSize    = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
+    metricRequest.pMetricName   = raw_name;
+    metricRequest.isolated      = true;
+    metricRequest.keepInstances = true;
     raw_requests.push_back(metricRequest);
   }
 
@@ -309,12 +298,11 @@ class metrics_config
 {
   bool initialized{};
 
-  void create(const std::string &chip_name,
-              const std::uint8_t *availability_image)
+  void create(const std::string &chip_name, const std::uint8_t *availability_image)
   {
     NVPW_CUDA_RawMetricsConfig_Create_V2_Params params{};
 
-    params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
     params.activityKind              = NVPA_ACTIVITY_KIND_PROFILER;
     params.pChipName                 = chip_name.c_str();
     params.pCounterAvailabilityImage = availability_image;
@@ -329,9 +317,8 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_SetCounterAvailability_Params params{};
 
-    params.structSize =
-      NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
-    params.pRawMetricsConfig         = raw_metrics_config;
+    params.structSize        = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
+    params.pRawMetricsConfig = raw_metrics_config;
     params.pCounterAvailabilityImage = availability_image;
 
     nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
@@ -341,7 +328,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_BeginPassGroup_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
@@ -351,7 +338,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_AddMetrics_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize         = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
     params.pRawMetricsConfig  = raw_metrics_config;
     params.pRawMetricRequests = raw_metric_requests.data();
     params.numMetricRequests  = raw_metric_requests.size();
@@ -363,7 +350,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_EndPassGroup_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
@@ -373,8 +360,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_GenerateConfigImage_Params params{};
 
-    params.structSize =
-      NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
@@ -398,7 +384,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_GetConfigImage_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
     params.bytesAllocated    = 0;
     params.pBuffer           = nullptr;
@@ -419,7 +405,7 @@ class metrics_config
     {
       NVPW_RawMetricsConfig_Destroy_Params params{};
 
-      params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
       params.pRawMetricsConfig = raw_metrics_config;
 
       NVPW_RawMetricsConfig_Destroy(&params);
@@ -433,13 +419,12 @@ class metrics_config
 
 void cupti_profiler::initialize_config_image()
 {
-  m_config_image =
-    metrics_config(m_chip_name,
-                   get_raw_metric_requests(m_chip_name,
-                                           m_metric_names,
-                                           m_availability_image.data()),
-                   m_availability_image.data())
-      .get_config_image();
+  m_config_image = metrics_config(m_chip_name,
+                                  get_raw_metric_requests(m_chip_name,
+                                                          m_metric_names,
+                                                          m_availability_image.data()),
+                                  m_availability_image.data())
+                     .get_config_image();
 }
 
 namespace
@@ -450,12 +435,11 @@ class counter_data_builder
   bool initialized{};
 
 public:
-  counter_data_builder(const std::string &chip_name,
-                       const std::uint8_t *pCounterAvailabilityImage)
+  counter_data_builder(const std::string &chip_name, const std::uint8_t *pCounterAvailabilityImage)
   {
     NVPW_CUDA_CounterDataBuilder_Create_Params params{};
 
-    params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
     params.pChipName                 = chip_name.c_str();
     params.pCounterAvailabilityImage = pCounterAvailabilityImage;
 
@@ -471,7 +455,7 @@ class counter_data_builder
     {
       NVPW_CounterDataBuilder_Destroy_Params params{};
 
-      params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
+      params.structSize          = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
       params.pCounterDataBuilder = builder;
 
       NVPW_CounterDataBuilder_Destroy(&params);
@@ -488,16 +472,14 @@ void cupti_profiler::initialize_counter_data_prefix_image()
   const std::uint8_t *counter_availability_image = nullptr;
 
   std::vector<NVPA_RawMetricRequest> raw_metric_requests =
-    get_raw_metric_requests(m_chip_name,
-                            m_metric_names,
-                            counter_availability_image);
+    get_raw_metric_requests(m_chip_name, m_metric_names, counter_availability_image);
 
   counter_data_builder data_builder(m_chip_name, counter_availability_image);
 
   {
     NVPW_CounterDataBuilder_AddMetrics_Params params{};
 
-    params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
     params.pCounterDataBuilder = data_builder.builder;
     params.pRawMetricRequests  = raw_metric_requests.data();
     params.numMetricRequests   = raw_metric_requests.size();
@@ -508,8 +490,7 @@ void cupti_profiler::initialize_counter_data_prefix_image()
   {
     NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params{};
 
-    params.structSize =
-      NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
     params.pCounterDataBuilder = data_builder.builder;
     params.bytesAllocated      = 0;
     params.pBuffer             = nullptr;
@@ -532,11 +513,9 @@ get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
 {
   CUpti_Profiler_CounterDataImage_CalculateSize_Params params{};
 
-  params.structSize =
-    CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
-  params.pOptions = options;
-  params.sizeofCounterDataImageOptions =
-    CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+  params.structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
+  params.pOptions   = options;
+  params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
 
   cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
   return params.counterDataImageSize;
@@ -559,12 +538,10 @@ void cupti_profiler::initialize_counter_data_image()
   {
     CUpti_Profiler_CounterDataImage_Initialize_Params params{};
 
-    params.structSize =
-      CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
-    params.sizeofCounterDataImageOptions =
-      CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
-    params.pOptions             = &counter_data_image_options;
-    params.counterDataImageSize = m_data_image.size();
+    params.structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
+    params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+    params.pOptions                      = &counter_data_image_options;
+    params.counterDataImageSize          = m_data_image.size();
 
     params.pCounterDataImage = &m_data_image[0];
     cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
@@ -578,8 +555,7 @@ void cupti_profiler::initialize_counter_data_image()
     params.counterDataImageSize = m_data_image.size();
     params.pCounterDataImage    = &m_data_image[0];
 
-    cupti_call(
-      cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
+    cupti_call(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
 
     m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
   }
@@ -587,8 +563,7 @@ void cupti_profiler::initialize_counter_data_image()
   {
     CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params{};
 
-    params.structSize =
-      CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
+    params.structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
     params.counterDataImageSize         = m_data_image.size();
     params.pCounterDataImage            = &m_data_image[0];
     params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
@@ -608,17 +583,14 @@ cupti_profiler::~cupti_profiler()
   }
 }
 
-bool cupti_profiler::is_initialized() const
-{
-  return m_available;
-}
+bool cupti_profiler::is_initialized() const { return m_available; }
 
 void cupti_profiler::prepare_user_loop()
 {
   {
     CUpti_Profiler_BeginSession_Params params{};
 
-    params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
+    params.structSize                   = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
     params.ctx                          = nullptr;
     params.counterDataImageSize         = m_data_image.size();
     params.pCounterDataImage            = &m_data_image[0];
@@ -735,9 +707,7 @@ std::vector<double> cupti_profiler::get_counter_values()
 
     if (params.numRanges != 1)
     {
-      NVBENCH_THROW(std::runtime_error,
-                    "{}",
-                    "Something's gone wrong, one range is expected");
+      NVBENCH_THROW(std::runtime_error, "{}", "Something's gone wrong, one range is expected");
     }
   }
 
@@ -752,8 +722,7 @@ std::vector<double> cupti_profiler::get_counter_values()
     {
       NVPW_MetricsEvaluator_SetDeviceAttributes_Params params{};
 
-      params.structSize =
-        NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
+      params.structSize           = NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
       params.pMetricsEvaluator    = evaluator;
       params.pCounterDataImage    = m_data_image.data();
       params.counterDataImageSize = m_data_image.size();
@@ -764,11 +733,10 @@ std::vector<double> cupti_profiler::get_counter_values()
     {
       NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params{};
 
-      params.structSize =
-        NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
-      params.pMetricsEvaluator           = evaluator;
-      params.pMetricEvalRequests         = &request.request;
-      params.numMetricEvalRequests       = 1;
+      params.structSize            = NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
+      params.pMetricsEvaluator     = evaluator;
+      params.pMetricEvalRequests   = &request.request;
+      params.numMetricEvalRequests = 1;
       params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
       params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
       params.pCounterDataImage           = m_data_image.data();
diff --git a/nvbench/detail/device_scope.cuh b/nvbench/detail/device_scope.cuh
index de3a55a3..c924beed 100644
--- a/nvbench/detail/device_scope.cuh
+++ b/nvbench/detail/device_scope.cuh
@@ -39,9 +39,9 @@ struct [[maybe_unused]] device_scope
   ~device_scope() { NVBENCH_CUDA_CALL(cudaSetDevice(m_old_device_id)); }
 
   // move-only
-  device_scope(device_scope &&) = default;
-  device_scope &operator=(device_scope &&) = default;
-  device_scope(const device_scope &)       = delete;
+  device_scope(device_scope &&)                 = default;
+  device_scope &operator=(device_scope &&)      = default;
+  device_scope(const device_scope &)            = delete;
   device_scope &operator=(const device_scope &) = delete;
 
 private:
diff --git a/nvbench/detail/entropy_criterion.cuh b/nvbench/detail/entropy_criterion.cuh
new file mode 100644
index 00000000..b0e4ebe0
--- /dev/null
+++ b/nvbench/detail/entropy_criterion.cuh
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/types.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/detail/ring_buffer.cuh>
+
+#include <vector>
+
+namespace nvbench::detail
+{
+
+class entropy_criterion final : public stopping_criterion_base
+{ // Stopping criterion driven by the entropy of the recorded measurements
+  // state: sample totals and the sorted (measurement, occurrence count) pairs
+  nvbench::int64_t m_total_samples{};
+  nvbench::float64_t m_total_cuda_time{};
+  std::vector<std::pair<nvbench::float64_t, nvbench::int64_t>> m_freq_tracker;
+
+  // TODO The window size should be user-configurable
+  nvbench::detail::ring_buffer<nvbench::float64_t> m_entropy_tracker{299};
+
+  // Used to avoid re-allocating temporary memory
+  std::vector<nvbench::float64_t> m_probabilities;
+  // Entropy, in bits, of the counts in m_freq_tracker relative to m_total_samples
+  nvbench::float64_t compute_entropy();
+
+public:
+  entropy_criterion();
+
+protected:
+  virtual void do_initialize() override;
+  virtual void do_add_measurement(nvbench::float64_t measurement) override;
+  virtual bool do_is_finished() override;
+
+};
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/entropy_criterion.cxx b/nvbench/detail/entropy_criterion.cxx
new file mode 100644
index 00000000..6d9ba8cd
--- /dev/null
+++ b/nvbench/detail/entropy_criterion.cxx
@@ -0,0 +1,137 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/entropy_criterion.cuh>
+#include <nvbench/types.cuh>
+
+#include <algorithm>
+#include <cmath>
+
+namespace nvbench::detail
+{
+
+entropy_criterion::entropy_criterion()
+    : stopping_criterion_base{"entropy", {{"max-angle", 0.048}, {"min-r2", 0.36}}}
+{ // Reserve twice the entropy window's capacity up front in both vectors
+  m_freq_tracker.reserve(m_entropy_tracker.capacity() * 2);
+  m_probabilities.reserve(m_entropy_tracker.capacity() * 2);
+}
+
+void entropy_criterion::do_initialize()
+{ // Zero the sample totals and discard the entropy and frequency histories
+  m_total_samples   = 0;
+  m_total_cuda_time = 0.0;
+  m_entropy_tracker.clear();
+  m_freq_tracker.clear();
+}
+
+// Shannon entropy, in bits, of the m_freq_tracker counts normalized by m_total_samples.
+nvbench::float64_t entropy_criterion::compute_entropy()
+{
+  if (m_freq_tracker.empty())
+  {
+    return 0.0; // nothing recorded yet
+  }
+
+  const auto total = static_cast<nvbench::float64_t>(m_total_samples);
+  m_probabilities.resize(m_freq_tracker.size());
+  auto out = m_probabilities.begin();
+  for (const auto &bucket : m_freq_tracker)
+  {
+    *out++ = static_cast<nvbench::float64_t>(bucket.second) / total;
+  }
+
+  nvbench::float64_t bits{};
+  for (const nvbench::float64_t probability : m_probabilities)
+  {
+    bits -= probability * std::log2(probability);
+  }
+  return bits;
+}
+
+// Records one measurement: updates the running totals, bumps the occurrence count of its
+// key in the sorted m_freq_tracker, then appends the new entropy to m_entropy_tracker.
+void entropy_criterion::do_add_measurement(nvbench::float64_t measurement)
+{
+  ++m_total_samples;
+  m_total_cuda_time += measurement;
+
+  // Optional rounding of the key to a multiple of `epsilon`; currently disabled.
+  auto key                = measurement;
+  constexpr bool bin_keys = false;
+  if constexpr (bin_keys)
+  {
+    const auto resolution_us = 0.5;
+    const auto resolution_s  = resolution_us / 1'000'000;
+    const auto epsilon       = resolution_s * 2;
+    key                      = std::round(key / epsilon) * epsilon;
+  }
+
+  // This approach is about 3x faster than `std::{unordered_,}map`
+  // Up to 100k samples, only about 20% slower than corresponding stdrel method
+  const auto slot = std::lower_bound(m_freq_tracker.begin(),
+                                     m_freq_tracker.end(),
+                                     std::make_pair(key, nvbench::int64_t{}));
+
+  if (slot != m_freq_tracker.end() && slot->first == key)
+  {
+    slot->second += 1;
+  }
+  else
+  {
+    m_freq_tracker.insert(slot, std::make_pair(key, nvbench::int64_t{1}));
+  }
+
+  m_entropy_tracker.push_back(compute_entropy());
+}
+
+// Finished once a linear fit over the tracked entropy values is flat enough: the slope,
+// converted by statistics::slope2deg, must not exceed "max-angle", and the r2 returned by
+// statistics::compute_r2 must not be below "min-r2".
+bool entropy_criterion::do_is_finished()
+{
+  // Only even sample counts are evaluated; this reduces the overhead and is not required to
+  // compute entropy. It makes `is_finished()` about 20% faster than the stdrel method.
+  const bool odd_sample_count  = (m_total_samples % 2 != 0);
+  const bool too_few_entropies = m_entropy_tracker.size() < 2;
+  if (too_few_entropies || odd_sample_count)
+  {
+    return false;
+  }
+
+  // Fit a line through the tracked entropy values.
+  auto first = m_entropy_tracker.cbegin();
+  auto last  = m_entropy_tracker.cend();
+  auto mean  = statistics::compute_mean(first, last);
+
+  const auto [slope, intercept] = statistics::compute_linear_regression(first, last, mean);
+
+  const auto angle = statistics::slope2deg(slope);
+  if (angle > m_params.get_float64("max-angle"))
+  {
+    return false;
+  }
+
+  const auto r2        = statistics::compute_r2(first, last, mean, slope, intercept);
+  const bool r2_is_low = r2 < m_params.get_float64("min-r2");
+
+  // Negated `<` rather than `>=`: only an r2 that compares below the threshold fails.
+  return !r2_is_low;
+}
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/kernel_launcher_timer_wrapper.cuh b/nvbench/detail/kernel_launcher_timer_wrapper.cuh
index 39a999ec..1efdf6e7 100644
--- a/nvbench/detail/kernel_launcher_timer_wrapper.cuh
+++ b/nvbench/detail/kernel_launcher_timer_wrapper.cuh
@@ -33,7 +33,7 @@ namespace detail
 template <typename KernelLauncher>
 struct kernel_launch_timer_wrapper
 {
- explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
+  explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
       : m_kernel_launcher{launcher}
   {}
 
diff --git a/nvbench/detail/l2flush.cuh b/nvbench/detail/l2flush.cuh
index 0e33f7e4..f85b3e79 100644
--- a/nvbench/detail/l2flush.cuh
+++ b/nvbench/detail/l2flush.cuh
@@ -31,13 +31,12 @@ struct l2flush
   {
     int dev_id{};
     NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
-    NVBENCH_CUDA_CALL(
-      cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
+    NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
     if (m_l2_size > 0)
     {
-      void* buffer = m_l2_buffer;
-      NVBENCH_CUDA_CALL(cudaMalloc(&buffer, m_l2_size));
-      m_l2_buffer = reinterpret_cast<int*>(buffer);
+      void *buffer = m_l2_buffer;
+      NVBENCH_CUDA_CALL(cudaMalloc(&buffer, static_cast<std::size_t>(m_l2_size)));
+      m_l2_buffer = reinterpret_cast<int *>(buffer);
     }
   }
 
@@ -53,7 +52,8 @@ struct l2flush
   {
     if (m_l2_size > 0)
     {
-      NVBENCH_CUDA_CALL(cudaMemsetAsync(m_l2_buffer, 0, m_l2_size, stream));
+      NVBENCH_CUDA_CALL(
+        cudaMemsetAsync(m_l2_buffer, 0, static_cast<std::size_t>(m_l2_size), stream));
     }
   }
 
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 380d2cd1..3b415fb9 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -16,53 +16,48 @@
  *  limitations under the License.
  */
 
-#include <nvbench/detail/measure_cold.cuh>
-
 #include <nvbench/benchmark_base.cuh>
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/detail/measure_cold.cuh>
+#include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_base.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 
-#include <nvbench/detail/ring_buffer.cuh>
-#include <nvbench/detail/throw.cuh>
-
 #include <fmt/format.h>
 
-#include <algorithm>
-#include <cstdio>
-#include <stdexcept>
-#include <variant>
-
 namespace nvbench::detail
 {
 
 measure_cold_base::measure_cold_base(state &exec_state)
     : m_state{exec_state}
     , m_launch{m_state.get_cuda_stream()}
+    , m_criterion_params{exec_state.get_criterion_params()}
+    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
     , m_run_once{exec_state.get_run_once()}
     , m_no_block{exec_state.get_disable_blocking_kernel()}
     , m_min_samples{exec_state.get_min_samples()}
-    , m_max_noise{exec_state.get_max_noise()}
-    , m_min_time{exec_state.get_min_time()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
-{}
+{
+  if (m_min_samples > 0)
+  {
+    m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
+    m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
+  }
+}
 
 void measure_cold_base::check()
 {
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cold` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cold` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -72,10 +67,11 @@ void measure_cold_base::initialize()
   m_total_cpu_time  = 0.;
   m_cpu_noise       = 0.;
   m_total_samples   = 0;
-  m_noise_tracker.clear();
   m_cuda_times.clear();
   m_cpu_times.clear();
   m_max_time_exceeded = false;
+
+  m_stopping_criterion.initialize(m_criterion_params);
 }
 
 void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
@@ -91,18 +87,7 @@ void measure_cold_base::record_measurements()
   m_total_cpu_time += cur_cpu_time;
   ++m_total_samples;
 
-  // Compute convergence statistics using CUDA timings:
-  const auto mean_cuda_time = m_total_cuda_time /
-                              static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cuda_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
-                                                    m_cuda_times.cend(),
-                                                    mean_cuda_time);
-  auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
-  if (std::isfinite(cuda_rel_stdev))
-  {
-    m_noise_tracker.push_back(cuda_rel_stdev);
-  }
+  m_stopping_criterion.add_measurement(cur_cuda_time);
 }
 
 bool measure_cold_base::is_finished()
@@ -113,39 +98,12 @@ bool measure_cold_base::is_finished()
   }
 
   // Check that we've gathered enough samples:
-  if (m_total_cuda_time > m_min_time && m_total_samples > m_min_samples)
+  if (m_total_samples > m_min_samples)
   {
-    // Noise has dropped below threshold
-    if (m_noise_tracker.back() < m_max_noise)
+    if (m_stopping_criterion.is_finished())
     {
       return true;
     }
-
-    // Check if the noise (cuda rel stdev) has converged by inspecting a
-    // trailing window of recorded noise measurements.
-    // This helps identify benchmarks that are inherently noisy and would
-    // never converge to the target stdev threshold. This check ensures that the
-    // benchmark will end if the stdev stabilizes above the target threshold.
-    // Gather some iterations before checking noise, and limit how often we
-    // check this.
-    if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
-    {
-      // Use the current noise as the stdev reference.
-      const auto current_noise = m_noise_tracker.back();
-      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
-        m_noise_tracker.cbegin(),
-        m_noise_tracker.cend(),
-        current_noise);
-      const auto noise_rel_stdev = noise_stdev / current_noise;
-
-      // If the rel stdev of the last N cuda noise measurements is less than
-      // 5%, consider the result stable.
-      const auto noise_threshold = 0.05;
-      if (noise_rel_stdev < noise_threshold)
-      {
-        return true;
-      }
-    }
   }
 
   // Check for timeouts:
@@ -162,13 +120,11 @@ bool measure_cold_base::is_finished()
 void measure_cold_base::run_trials_epilogue()
 {
   // Only need to compute this at the end, not per iteration.
-  const auto cpu_mean = m_total_cuda_time /
-                        static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cpu_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
-                                                    m_cpu_times.cend(),
-                                                    cpu_mean);
-  m_cpu_noise = cpu_stdev / cpu_mean;
+  const auto cpu_mean  = m_total_cpu_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                                         m_cpu_times.cend(),
+                                                                         cpu_mean);
+  m_cpu_noise          = cpu_stdev / cpu_mean; // rel stdev about the CPU (not CUDA) mean
 
   m_walltime_timer.stop();
 }
@@ -199,8 +155,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated CPU times");
+    summ.set_string("description", "Relative standard deviation of isolated CPU times");
     summ.set_float64("value", m_cpu_noise);
   }
 
@@ -215,16 +170,21 @@ void measure_cold_base::generate_summaries()
     summ.set_float64("value", avg_cuda_time);
   }
 
+  const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+                                                                          m_cuda_times.cend(),
+                                                                          mean_cuda_time);
+  const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
+  const auto noise = cuda_rel_stdev;
+  const auto max_noise = m_criterion_params.get_float64("max-noise");
+  const auto min_time = m_criterion_params.get_float64("min-time");
+
   {
     auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated GPU times");
-    summ.set_float64("value",
-                     m_noise_tracker.empty()
-                       ? std::numeric_limits<nvbench::float64_t>::infinity()
-                       : m_noise_tracker.back());
+    summ.set_string("description", "Relative standard deviation of isolated GPU times");
+    summ.set_float64("value", noise);
   }
 
   if (const auto items = m_state.get_element_count(); items != 0)
@@ -232,8 +192,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
     summ.set_string("name", "Elem/s");
     summ.set_string("hint", "item_rate");
-    summ.set_string("description",
-                    "Number of input elements processed per second");
+    summ.set_string("description", "Number of input elements processed per second");
     summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
   }
 
@@ -251,8 +210,8 @@ void measure_cold_base::generate_summaries()
     }
 
     {
-      const auto peak_gmem_bw = static_cast<double>(
-        m_state.get_device()->get_global_memory_bus_bandwidth());
+      const auto peak_gmem_bw =
+        static_cast<double>(m_state.get_device()->get_global_memory_bus_bandwidth());
 
       auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
       summ.set_string("name", "BWUtil");
@@ -274,8 +233,7 @@ void measure_cold_base::generate_summaries()
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
 
@@ -283,15 +241,15 @@ void measure_cold_base::generate_summaries()
     {
       const auto timeout = m_walltime_timer.get_duration();
 
-      if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise)
+      if (noise > max_noise)
       {
         printer.log(nvbench::log_level::warn,
                     fmt::format("Current measurement timed out ({:0.2f}s) "
                                 "while over noise threshold ({:0.2f}% > "
                                 "{:0.2f}%)",
                                 timeout,
-                                m_noise_tracker.back() * 100,
-                                m_max_noise * 100));
+                                noise * 100,
+                                max_noise * 100));
       }
       if (m_total_samples < m_min_samples)
       {
@@ -302,7 +260,7 @@ void measure_cold_base::generate_summaries()
                                 m_total_samples,
                                 m_min_samples));
       }
-      if (m_total_cuda_time < m_min_time)
+      if (m_total_cuda_time < min_time)
       {
         printer.log(nvbench::log_level::warn,
                     fmt::format("Current measurement timed out ({:0.2f}s) "
@@ -310,7 +268,7 @@ void measure_cold_base::generate_summaries()
                                 "{:0.2f}s)",
                                 timeout,
                                 m_total_cuda_time,
-                                m_min_time));
+                                min_time));
       }
     }
 
@@ -324,10 +282,7 @@ void measure_cold_base::generate_summaries()
                             m_walltime_timer.get_duration(),
                             m_total_samples));
 
-    printer.process_bulk_data(m_state,
-                              "nv/cold/sample_times",
-                              "sample_times",
-                              m_cuda_times);
+    printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
   }
 }
 
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 0cab36ae..2b0183f5 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -25,15 +25,14 @@
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
+#include <nvbench/stopping_criterion.cuh>
 
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
-#include <nvbench/detail/ring_buffer.cuh>
 #include <nvbench/detail/statistics.cuh>
 
 #include <cuda_runtime.h>
 
-#include <algorithm>
 #include <utility>
 #include <vector>
 
@@ -49,10 +48,10 @@ namespace detail
 struct measure_cold_base
 {
   explicit measure_cold_base(nvbench::state &exec_state);
-  measure_cold_base(const measure_cold_base &) = delete;
-  measure_cold_base(measure_cold_base &&)      = delete;
+  measure_cold_base(const measure_cold_base &)            = delete;
+  measure_cold_base(measure_cold_base &&)                 = delete;
   measure_cold_base &operator=(const measure_cold_base &) = delete;
-  measure_cold_base &operator=(measure_cold_base &&) = delete;
+  measure_cold_base &operator=(measure_cold_base &&)      = delete;
 
 protected:
   template <bool use_blocking_kernel>
@@ -68,10 +67,7 @@ protected:
 
   void check_skip_time(nvbench::float64_t warmup_time);
 
-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
 
   __forceinline__ void sync_stream() const
   {
@@ -90,12 +86,13 @@ protected:
   nvbench::detail::l2flush m_l2flush;
   nvbench::blocking_kernel m_blocker;
 
+  nvbench::criterion_params m_criterion_params;
+  nvbench::stopping_criterion_base& m_stopping_criterion;
+
   bool m_run_once{false};
   bool m_no_block{false};
 
   nvbench::int64_t m_min_samples{};
-  nvbench::float64_t m_max_noise{}; // rel stdev
-  nvbench::float64_t m_min_time{};
 
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
@@ -105,9 +102,6 @@ protected:
   nvbench::float64_t m_total_cpu_time{};
   nvbench::float64_t m_cpu_noise{}; // rel stdev
 
-  // Trailing history of noise measurements for convergence tests
-  nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512};
-
   std::vector<nvbench::float64_t> m_cuda_times;
   std::vector<nvbench::float64_t> m_cpu_times;
 
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index 9e8de6c3..e583cd54 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -50,8 +50,7 @@ struct metric_traits;
 template <>
 struct metric_traits<metric_id::dram_peak_sustained_throughput>
 {
-  static constexpr const char *metric_name =
-    "dram__throughput.avg.pct_of_peak_sustained_elapsed";
+  static constexpr const char *metric_name = "dram__throughput.avg.pct_of_peak_sustained_elapsed";
 
   static constexpr const char *name = "HBWPeak";
   static constexpr const char *hint = "percentage";
@@ -119,10 +118,7 @@ struct metric_traits<metric_id::l1_hit_rate>
   static constexpr const char *description = "Hit rate at L1 cache.";
   static constexpr double divider          = 100.0;
 
-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l1_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l1_hit_rate_collected(); };
 };
 
 template <>
@@ -134,10 +130,7 @@ struct metric_traits<metric_id::l2_hit_rate>
   static constexpr const char *description = "Hit rate at L2 cache.";
   static constexpr double divider          = 100.0;
 
-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l2_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l2_hit_rate_collected(); };
 };
 
 template <metric_id id = metric_id::dram_peak_sustained_throughput>
@@ -153,8 +146,7 @@ void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
 }
 
 template <>
-void add_metrics_impl<metric_id::count>(nvbench::state &,
-                                        std::vector<std::string> &)
+void add_metrics_impl<metric_id::count>(nvbench::state &, std::vector<std::string> &)
 {}
 
 std::vector<std::string> add_metrics(nvbench::state &state)
@@ -179,13 +171,11 @@ try
 // clang-format on
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref)
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to construct profiler: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to construct profiler: {}", ex.what()));
   }
 }
 
@@ -194,15 +184,11 @@ void measure_cupti_base::check()
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cupti` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cupti` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -210,16 +196,13 @@ namespace
 {
 
 template <metric_id id = metric_id::dram_peak_sustained_throughput>
-void gen_summary(std::size_t result_id,
-                 nvbench::state &m_state,
-                 const std::vector<double> &result)
+void gen_summary(std::size_t result_id, nvbench::state &m_state, const std::vector<double> &result)
 {
   using metric = metric_traits<id>;
 
   if (metric::is_collected(m_state))
   {
-    auto &summ =
-      m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
+    auto &summ = m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
     summ.set_string("name", metric::name);
     summ.set_string("hint", metric::hint);
     summ.set_string("description", metric::description);
@@ -231,9 +214,7 @@ void gen_summary(std::size_t result_id,
 }
 
 template <>
-void gen_summary<metric_id::count>(std::size_t,
-                                   nvbench::state &,
-                                   const std::vector<double> &)
+void gen_summary<metric_id::count>(std::size_t, nvbench::state &, const std::vector<double> &)
 {}
 
 void gen_summaries(nvbench::state &state, const std::vector<double> &result)
@@ -266,8 +247,7 @@ try
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::pass,
@@ -278,13 +258,11 @@ try
 }
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref)
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to generate the summary: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to generate the summary: {}", ex.what()));
   }
 }
 
diff --git a/nvbench/detail/measure_cupti.cuh b/nvbench/detail/measure_cupti.cuh
index 736c3754..ec7b2120 100644
--- a/nvbench/detail/measure_cupti.cuh
+++ b/nvbench/detail/measure_cupti.cuh
@@ -50,10 +50,10 @@ namespace detail
 struct measure_cupti_base
 {
   explicit measure_cupti_base(nvbench::state &exec_state);
-  measure_cupti_base(const measure_cupti_base &) = delete;
-  measure_cupti_base(measure_cupti_base &&)      = delete;
+  measure_cupti_base(const measure_cupti_base &)            = delete;
+  measure_cupti_base(measure_cupti_base &&)                 = delete;
   measure_cupti_base &operator=(const measure_cupti_base &) = delete;
-  measure_cupti_base &operator=(measure_cupti_base &&) = delete;
+  measure_cupti_base &operator=(measure_cupti_base &&)      = delete;
 
 protected:
   struct kernel_launch_timer;
@@ -61,10 +61,7 @@ protected:
   void check();
   void generate_summaries();
 
-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
 
   __forceinline__ void sync_stream() const
   {
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 25e2119f..94971229 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -47,9 +47,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
   // to match the cold result if available.
   try
   {
-    nvbench::int64_t cold_samples =
-      m_state.get_summary("nv/cold/sample_size").get_int64("value");
-    m_min_samples = std::max(m_min_samples, cold_samples);
+    nvbench::int64_t cold_samples = m_state.get_summary("nv/cold/sample_size").get_int64("value");
+    m_min_samples                 = std::max(m_min_samples, cold_samples);
 
     // If the cold measurement ran successfully, disable skip_time. It'd just
     // be annoying to skip now.
@@ -72,15 +71,11 @@ void measure_hot_base::check()
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `hot` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `hot` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -116,8 +111,7 @@ void measure_hot_base::generate_summaries()
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
 
diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh
index 9c4c2ecf..c9175830 100644
--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -27,7 +27,7 @@
 
 #include <cuda_runtime.h>
 
-#include <utility>
+#include <algorithm>
 
 namespace nvbench
 {
@@ -41,10 +41,10 @@ namespace detail
 struct measure_hot_base
 {
   explicit measure_hot_base(nvbench::state &exec_state);
-  measure_hot_base(const measure_hot_base &) = delete;
-  measure_hot_base(measure_hot_base &&)      = delete;
+  measure_hot_base(const measure_hot_base &)            = delete;
+  measure_hot_base(measure_hot_base &&)                 = delete;
   measure_hot_base &operator=(const measure_hot_base &) = delete;
-  measure_hot_base &operator=(measure_hot_base &&) = delete;
+  measure_hot_base &operator=(measure_hot_base &&)      = delete;
 
 protected:
   void check();
@@ -131,7 +131,7 @@ private:
     // The .95 factor here pads the batch_size a bit to avoid needing a second
     // batch due to noise.
     const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
-    auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
+    auto batch_size          = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
 
     do
     {
@@ -142,7 +142,7 @@ private:
         // Block stream until some work is queued.
         // Limit the number of kernel executions while blocked to prevent
         // deadlocks. See warnings on blocking_kernel.
-        const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
+        const auto blocked_launches   = std::min(batch_size, nvbench::int64_t{2});
         const auto unblocked_launches = batch_size - blocked_launches;
 
         this->block_stream();
@@ -189,7 +189,6 @@ private:
         break; // Stop iterating
       }
 
-
       m_walltime_timer.stop();
       if (m_walltime_timer.get_duration() > m_timeout)
       {
diff --git a/nvbench/detail/ring_buffer.cuh b/nvbench/detail/ring_buffer.cuh
index fa862004..5c00b24a 100644
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -22,18 +22,112 @@
 
 #include <nvbench/detail/statistics.cuh>
 
+#include <cstddef>
+#include <iterator>
 #include <cassert>
 #include <vector>
 
 namespace nvbench::detail
 {
 
+template <class T>
+class ring_buffer_iterator // random-access iterator over a ring buffer's storage
+{
+  std::ptrdiff_t m_index;    // logical position; wrapped modulo m_capacity on dereference
+  std::ptrdiff_t m_capacity; // slot count of the underlying storage
+  T *m_ptr;                  // first slot of the underlying storage
+
+public:
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type        = T;
+  using difference_type   = std::ptrdiff_t;
+  using pointer           = T *;
+  using reference         = T &;
+
+  ring_buffer_iterator(std::ptrdiff_t index, std::ptrdiff_t capacity, pointer ptr)
+      : m_index{index}
+      , m_capacity{capacity}
+      , m_ptr{ptr}
+  {}
+
+  ring_buffer_iterator &operator++() // returns *this by reference, consistent with operator--()
+  {
+    ++m_index;
+    return *this;
+  }
+
+  ring_buffer_iterator operator++(int)
+  {
+    ring_buffer_iterator temp = *this;
+    ++(*this);
+    return temp;
+  }
+
+  ring_buffer_iterator &operator--()
+  {
+    --m_index;
+    return *this;
+  }
+
+  ring_buffer_iterator operator--(int)
+  {
+    ring_buffer_iterator temp = *this;
+    --(*this);
+    return temp;
+  }
+
+  ring_buffer_iterator operator+(difference_type n) const
+  {
+    return ring_buffer_iterator(m_index + n, m_capacity, m_ptr);
+  }
+  ring_buffer_iterator operator-(difference_type n) const { return *this + (-n); }
+
+  // Compound forms are required by std::advance/std::next on random-access iterators.
+  ring_buffer_iterator &operator+=(difference_type n) { return *this = *this + n; }
+  ring_buffer_iterator &operator-=(difference_type n) { return *this = *this - n; }
+
+  difference_type operator-(const ring_buffer_iterator &other) const
+  {
+    return m_index - other.m_index;
+  }
+
+  // Element access wraps the logical index into the storage range.
+  reference operator*() const { return m_ptr[m_index % m_capacity]; }
+  pointer operator->() const { return &(operator*()); }
+  reference operator[](difference_type n) const { return *(*this + n); }
+
+  bool operator==(const ring_buffer_iterator &other) const
+  {
+    return m_ptr == other.m_ptr && m_index == other.m_index;
+  }
+  bool operator!=(const ring_buffer_iterator &other) const { return !(*this == other); }
+  bool operator<(const ring_buffer_iterator &other) const { return m_index < other.m_index; }
+  bool operator>(const ring_buffer_iterator &other) const { return m_index > other.m_index; }
+  bool operator<=(const ring_buffer_iterator &other) const { return !(*this > other); }
+  bool operator>=(const ring_buffer_iterator &other) const { return !(*this < other); }
+};
+
 /**
  * @brief A simple, dynamically sized ring buffer.
  */
 template <typename T>
 struct ring_buffer
 {
+private:
+  using buffer_t = typename std::vector<T>;
+  using diff_t   = typename buffer_t::difference_type;
+
+  buffer_t m_buffer;
+  std::size_t m_index{0};
+  bool m_full{false};
+
+  std::size_t get_front_index() const 
+  {
+    return m_full ? m_index : 0;
+  }
+
+public:
+
   /**
    * Create a new ring buffer with the requested capacity.
    */
@@ -42,34 +136,59 @@ struct ring_buffer
   {}
 
   /**
-   * Iterators provide all values in the ring buffer in unspecified order.
+   * Iterators provide all values in the ring buffer in FIFO order.
    * @{
    */
-  // clang-format off
-  [[nodiscard]] auto begin()        { return m_buffer.begin(); }
-  [[nodiscard]] auto begin() const  { return m_buffer.begin(); }
-  [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); }
-  [[nodiscard]] auto end()        { return m_buffer.begin()  + this->size(); }
-  [[nodiscard]] auto end() const  { return m_buffer.begin()  + this->size(); }
-  [[nodiscard]] auto cend() const { return m_buffer.cbegin() + this->size(); }
-  // clang-format on
+  [[nodiscard]] ring_buffer_iterator<T> begin()
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
+
+  [[nodiscard]] ring_buffer_iterator<T> end()
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index() + size()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
+  [[nodiscard]] ring_buffer_iterator<const T> begin() const
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
+
+  [[nodiscard]] ring_buffer_iterator<const T> end() const
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index() + size()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
+  [[nodiscard]] ring_buffer_iterator<const T> cbegin() const
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
+
+  [[nodiscard]] ring_buffer_iterator<const T> cend() const
+  {
+    return {static_cast<std::ptrdiff_t>(get_front_index() + size()),
+            static_cast<std::ptrdiff_t>(capacity()),
+            m_buffer.data()};
+  }
   /** @} */
 
   /**
    * The number of valid values in the ring buffer. Always <= capacity().
    */
-  [[nodiscard]] std::size_t size() const
-  {
-    return m_full ? m_buffer.size() : m_index;
-  }
+  [[nodiscard]] std::size_t size() const { return m_full ? m_buffer.size() : m_index; }
 
   /**
    * The maximum size of the ring buffer.
    */
-  [[nodiscard]] std::size_t capacity() const
-  {
-    return m_buffer.size();
-  }
+  [[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }
 
   /**
    * @return True if the ring buffer is empty.
@@ -119,11 +238,6 @@ struct ring_buffer
     return m_buffer[back_index];
   }
   /**@}*/
-
-private:
-  std::vector<T> m_buffer;
-  std::size_t m_index{0};
-  bool m_full{false};
 };
 
 } // namespace nvbench::detail
diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh
index 9352a5ff..bab2daf1 100644
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -53,7 +53,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
                 "`ExecTags` argument must be a member (or combination of "
                 "members) from nvbench::exec_tag.");
 
-  constexpr auto measure_tags = tags & measure_mask;
+  constexpr auto measure_tags  = tags & measure_mask;
   constexpr auto modifier_tags = tags & modifier_mask;
 
   // "run once" is handled by the cold measurement:
@@ -81,8 +81,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     }
     else
     {
-      this->exec(cold | hot | tags,
-                 std::forward<KernelLauncher>(kernel_launcher));
+      this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
     }
     return;
   }
@@ -99,8 +98,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     constexpr bool use_blocking_kernel = !(tags & no_block);
     if constexpr (tags & timer)
     {
-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
       if constexpr (!(modifier_tags & run_once))
       {
         if (this->is_cupti_required())
@@ -110,7 +109,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
           measure();
         }
       }
-      #endif
+#endif
 
       using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
       measure_t measure{*this, kernel_launcher};
@@ -121,8 +120,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
       using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
       wrapper_t wrapper{kernel_launcher};
 
-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
       if constexpr (!(modifier_tags & run_once))
       {
         if (this->is_cupti_required())
@@ -132,10 +131,9 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
           measure();
         }
       }
-      #endif
+#endif
 
-      using measure_t =
-        nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
+      using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
       measure_t measure(*this, wrapper);
       measure();
     }
@@ -143,12 +141,10 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
 
   if constexpr (tags & hot)
   {
-    static_assert(!(tags & sync),
-                  "Hot measurement doesn't support the `sync` exec_tag.");
-    static_assert(!(tags & timer),
-                  "Hot measurement doesn't support the `timer` exec_tag.");
+    static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
+    static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
     constexpr bool use_blocking_kernel = !(tags & no_block);
-    using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
+    using measure_t                    = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
     measure_t measure{*this, kernel_launcher};
     measure();
   }
diff --git a/nvbench/detail/state_generator.cxx b/nvbench/detail/state_generator.cxx
index 8c153bff..26a897a4 100644
--- a/nvbench/detail/state_generator.cxx
+++ b/nvbench/detail/state_generator.cxx
@@ -40,9 +40,7 @@ void state_iterator::add_axis(const nvbench::axis_base &axis)
   this->add_axis(axis.get_name(), axis.get_type(), axis.get_size());
 }
 
-void state_iterator::add_axis(std::string axis,
-                              nvbench::axis_type type,
-                              std::size_t size)
+void state_iterator::add_axis(std::string axis, nvbench::axis_type type, std::size_t size)
 {
   m_indices.push_back({std::move(axis), type, std::size_t{0}, size});
 }
@@ -74,10 +72,7 @@ state_iterator::get_current_indices() const
   return m_indices;
 }
 
-[[nodiscard]] bool state_iterator::iter_valid() const
-{
-  return m_current < m_total;
-}
+[[nodiscard]] bool state_iterator::iter_valid() const { return m_current < m_total; }
 
 void state_iterator::next()
 {
@@ -102,7 +97,7 @@ state_generator::state_generator(const benchmark_base &bench)
 
 void state_generator::build_axis_configs()
 {
-  const axes_metadata &axes = m_benchmark.get_axes();
+  const axes_metadata &axes                               = m_benchmark.get_axes();
   const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();
 
   // Construct two state_generators:
@@ -118,35 +113,29 @@ void state_generator::build_axis_configs()
     type_axes.reserve(axes_vec.size());
 
     // Filter all axes by into type and non-type:
-    std::for_each(axes_vec.cbegin(),
-                  axes_vec.cend(),
-                  [&non_type_si, &type_axes](const auto &axis) {
-                    if (axis->get_type() == nvbench::axis_type::type)
-                    {
-                      type_axes.push_back(
-                        std::cref(static_cast<const type_axis &>(*axis)));
-                    }
-                    else
-                    {
-                      non_type_si.add_axis(*axis);
-                    }
-                  });
+    std::for_each(axes_vec.cbegin(), axes_vec.cend(), [&non_type_si, &type_axes](const auto &axis) {
+      if (axis->get_type() == nvbench::axis_type::type)
+      {
+        type_axes.push_back(std::cref(static_cast<const type_axis &>(*axis)));
+      }
+      else
+      {
+        non_type_si.add_axis(*axis);
+      }
+    });
 
     // Reverse sort type axes by index. This way the state_generator's cartesian
     // product of the type axes values will be enumerated in the same order as
     // nvbench::tl::cartesian_product<type_axes>. This is necessary to ensure
     // that the correct states are passed to the corresponding benchmark
     // instantiations.
-    std::sort(type_axes.begin(),
-              type_axes.end(),
-              [](const auto &axis_1, const auto &axis_2) {
-                return axis_1.get().get_axis_index() >
-                       axis_2.get().get_axis_index();
-              });
-
-    std::for_each(type_axes.cbegin(),
-                  type_axes.cend(),
-                  [&type_si](const auto &axis) { type_si.add_axis(axis); });
+    std::sort(type_axes.begin(), type_axes.end(), [](const auto &axis_1, const auto &axis_2) {
+      return axis_1.get().get_axis_index() > axis_2.get().get_axis_index();
+    });
+
+    std::for_each(type_axes.cbegin(), type_axes.cend(), [&type_si](const auto &axis) {
+      type_si.add_axis(axis);
+    });
   }
 
   // type_axis_configs generation:
@@ -157,8 +146,8 @@ void state_generator::build_axis_configs()
     // Build type_axis_configs
     for (type_si.init(); type_si.iter_valid(); type_si.next())
     {
-      auto &[config, active_mask] = m_type_axis_configs.emplace_back(
-        std::make_pair(nvbench::named_values{}, true));
+      auto &[config, active_mask] =
+        m_type_axis_configs.emplace_back(std::make_pair(nvbench::named_values{}, true));
 
       // Reverse the indices so they're once again in the same order as
       // specified:
@@ -173,8 +162,7 @@ void state_generator::build_axis_configs()
           active_mask = false;
         }
 
-        config.set_string(axis_info.axis,
-                          axis.get_input_string(axis_info.index));
+        config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
       }
     } // type_si
   }   // type_axis_config generation
@@ -199,21 +187,18 @@ void state_generator::build_axis_configs()
             break;
 
           case axis_type::int64:
-            config.set_int64(
-              axis_info.axis,
-              axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_int64(axis_info.axis,
+                             axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
             break;
 
           case axis_type::float64:
-            config.set_float64(
-              axis_info.axis,
-              axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_float64(axis_info.axis,
+                               axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
             break;
 
           case axis_type::string:
-            config.set_string(
-              axis_info.axis,
-              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_string(axis_info.axis,
+                              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
             break;
         } // switch (type)
       }   // for (axis_info : current_indices)
@@ -239,15 +224,12 @@ void state_generator::build_states()
   }
 }
 
-void state_generator::add_states_for_device(
-  const std::optional<device_info> &device)
+void state_generator::add_states_for_device(const std::optional<device_info> &device)
 {
   const auto num_type_configs = m_type_axis_configs.size();
-  for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
-       ++type_config_index)
+  for (std::size_t type_config_index = 0; type_config_index < num_type_configs; ++type_config_index)
   {
-    const auto &[type_config,
-                 axis_mask] = m_type_axis_configs[type_config_index];
+    const auto &[type_config, axis_mask] = m_type_axis_configs[type_config_index];
 
     if (!axis_mask)
     { // Don't generate inner vector if the type config is masked out.
@@ -261,10 +243,7 @@ void state_generator::add_states_for_device(
       config.append(non_type_config);
 
       // Create benchmark:
-      m_states.push_back(nvbench::state{m_benchmark,
-                                        std::move(config),
-                                        device,
-                                        type_config_index});
+      m_states.push_back(nvbench::state{m_benchmark, std::move(config), device, type_config_index});
     }
   }
 }
diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh
index 957bca4c..522b4f21 100644
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -18,16 +18,22 @@
 
 #pragma once
 
-#include <nvbench/types.cuh>
-
 #include <nvbench/detail/transform_reduce.cuh>
+#include <nvbench/types.cuh>
 
 #include <cmath>
 #include <functional>
 #include <iterator>
 #include <limits>
+#include <numeric>
+#include <utility>
+
 #include <type_traits>
 
+#ifndef M_PI
+  #define M_PI 3.14159265358979323846
+#endif
+
 namespace nvbench::detail::statistics
 {
 
@@ -36,13 +42,13 @@ namespace nvbench::detail::statistics
  *
  * If the input has fewer than 5 sample, infinity is returned.
  */
-template <typename Iter,
-          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+template <typename Iter, typename ValueType = typename std::iterator_traits<Iter>::value_type>
 ValueType standard_deviation(Iter first, Iter last, ValueType mean)
 {
   static_assert(std::is_floating_point_v<ValueType>);
 
-  const auto num = last - first;
+  const auto num = std::distance(first, last);
+
   if (num < 5) // don't bother with low sample sizes.
   {
     return std::numeric_limits<ValueType>::infinity();
@@ -57,8 +63,135 @@ ValueType standard_deviation(Iter first, Iter last, ValueType mean)
                                                             val *= val;
                                                             return val;
                                                           }) /
-                        static_cast<ValueType>((num - 1));
+                        static_cast<ValueType>((num - 1)); // Bessel’s correction
   return std::sqrt(variance);
 }
 
+/**
+ * Returns the arithmetic mean of the samples in [first, last), accumulated as a double.
+ *
+ * An empty range has no mean; infinity is returned in that case.
+ */
+template <class It>
+nvbench::float64_t compute_mean(It first, It last)
+{
+  using result_t = nvbench::float64_t;
+
+  const auto count = std::distance(first, last);
+  if (count >= 1)
+  {
+    return std::accumulate(first, last, 0.0) / static_cast<result_t>(count);
+  }
+  return std::numeric_limits<result_t>::infinity();
+}
+
+/**
+ * Fits a least-squares line to the samples and returns the {slope, intercept} pair.
+ *
+ * Sample i is paired with x = i (starting from 0); `mean_y` is the precomputed mean of
+ * [first, last). If the input has fewer than 2 samples, infinity is returned for both values.
+ */
+template <class It>
+std::pair<nvbench::float64_t, nvbench::float64_t>
+compute_linear_regression(It first, It last, nvbench::float64_t mean_y)
+{
+  using f64 = nvbench::float64_t;
+
+  const auto count = static_cast<std::size_t>(std::distance(first, last));
+  if (count < 2)
+  {
+    const f64 inf = std::numeric_limits<f64>::infinity();
+    return {inf, inf};
+  }
+
+  // The x values are 0, 1, ..., count - 1, so their mean is (count - 1) / 2.
+  const f64 x_center = (static_cast<f64>(count) - 1.0) / 2.0;
+
+  // slope = sum(dx * dy) / sum(dx * dx), with dx and dy measured from the means.
+  f64 sum_dx_dy = 0.0;
+  f64 sum_dx_dx = 0.0;
+  for (std::size_t idx = 0; idx < count; ++idx)
+  {
+    const f64 dx = static_cast<f64>(idx) - x_center;
+    sum_dx_dy += dx * (*first - mean_y);
+    sum_dx_dx += dx * dx;
+    ++first;
+  }
+
+  // The fitted line passes through (x_center, mean_y), which fixes the intercept.
+  const f64 slope     = sum_dx_dy / sum_dx_dx;
+  const f64 intercept = mean_y - slope * x_center;
+  return {slope, intercept};
+}
+
+/**
+ * Overload that computes the mean of [first, last) itself before fitting the line.
+ * If the input has fewer than 2 samples, infinity is returned for both slope and intercept.
+ */
+template <class It>
+std::pair<nvbench::float64_t, nvbench::float64_t> compute_linear_regression(It first, It last)
+{
+  const nvbench::float64_t mean_y = compute_mean(first, last);
+  return compute_linear_regression(first, last, mean_y);
+}
+
+/**
+ * Computes and returns R^2 (the coefficient of determination) of the line
+ * `slope * i + intercept` against the samples, where i is the sample's index in [first, last).
+ *
+ * `mean_y` is the precomputed mean of [first, last). If the total sum of squares is zero (for
+ * example, constant or empty input), 1 is returned.
+ */
+template <class It>
+nvbench::float64_t compute_r2(It first,
+                              It last,
+                              nvbench::float64_t mean_y,
+                              nvbench::float64_t slope,
+                              nvbench::float64_t intercept)
+{
+  using f64 = nvbench::float64_t;
+
+  const auto count = static_cast<std::size_t>(std::distance(first, last));
+
+  f64 total_sq    = 0.0; // sum of squared deviations from the mean
+  f64 residual_sq = 0.0; // sum of squared deviations from the fitted line
+  for (std::size_t idx = 0; idx < count; ++idx, ++first)
+  {
+    const f64 sample    = *first;
+    const f64 from_mean = sample - mean_y;
+    const f64 from_line = sample - (slope * static_cast<f64>(idx) + intercept);
+
+    total_sq += from_mean * from_mean;
+    residual_sq += from_line * from_line;
+  }
+
+  // No variation around the mean: the ratio below would divide by zero.
+  return total_sq == 0.0 ? 1.0 : 1.0 - residual_sq / total_sq;
+}
+
+/// Computes and returns the R^2 (coefficient of determination).
+/// This overload computes the mean of [first, last) itself.
+template <class It>
+nvbench::float64_t
+compute_r2(It first, It last, nvbench::float64_t slope, nvbench::float64_t intercept)
+{
+  const nvbench::float64_t mean_y = compute_mean(first, last);
+  return compute_r2(first, last, mean_y, slope, intercept);
+}
+
+/**
+ * Converts an angle from radians to degrees.
+ */
+inline nvbench::float64_t rad2deg(nvbench::float64_t rad) { return rad * 180.0 / M_PI; }
+
+/**
+ * Returns the angle, in radians, of a line with the given slope, i.e. atan2(slope, 1).
+ */
+inline nvbench::float64_t slope2rad(nvbench::float64_t slope) { return std::atan2(slope, 1.0); }
+
+/**
+ * Returns the angle, in degrees, of a line with the given slope.
+ */
+inline nvbench::float64_t slope2deg(nvbench::float64_t slope) { return rad2deg(slope2rad(slope)); }
+
 } // namespace nvbench::detail::statistics
diff --git a/nvbench/detail/stdrel_criterion.cuh b/nvbench/detail/stdrel_criterion.cuh
new file mode 100644
index 00000000..5f87e842
--- /dev/null
+++ b/nvbench/detail/stdrel_criterion.cuh
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/types.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/detail/ring_buffer.cuh>
+
+#include <vector>
+
+namespace nvbench::detail
+{
+
+class stdrel_criterion final : public stopping_criterion_base
+{
+  // Accumulated since the last do_initialize():
+  nvbench::int64_t m_total_samples{};             // number of measurements added
+  nvbench::float64_t m_total_cuda_time{};         // sum of all measurements
+  std::vector<nvbench::float64_t> m_cuda_times{}; // every measurement, in order
+  nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512}; // finite rel. stdevs
+
+public:
+  stdrel_criterion();
+
+protected:
+  void do_initialize() override;
+  void do_add_measurement(nvbench::float64_t measurement) override;
+  bool do_is_finished() override;
+};
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx
new file mode 100644
index 00000000..a6c5ea8a
--- /dev/null
+++ b/nvbench/detail/stdrel_criterion.cxx
@@ -0,0 +1,98 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/stdrel_criterion.cuh>
+
+namespace nvbench::detail
+{
+
+stdrel_criterion::stdrel_criterion()
+    : stopping_criterion_base{"stdrel", // name passed to the base class
+                              {{"max-noise", nvbench::detail::compat_max_noise()},
+                               {"min-time", nvbench::detail::compat_min_time()}}}
+{} // "max-noise" and "min-time" are read back through m_params in do_is_finished()
+
+void stdrel_criterion::do_initialize()
+{
+  m_noise_tracker.clear();
+  m_cuda_times.clear();
+  m_total_cuda_time = 0.0;
+  m_total_samples   = 0;
+}
+
+void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
+{
+  namespace stats = nvbench::detail::statistics;
+
+  ++m_total_samples;
+  m_total_cuda_time += measurement;
+  m_cuda_times.push_back(measurement);
+
+  // Noise = relative standard deviation of every CUDA timing recorded so far.
+  const auto mean  = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto stdev = stats::standard_deviation(m_cuda_times.cbegin(), m_cuda_times.cend(), mean);
+  const auto noise = stdev / mean;
+  if (std::isfinite(noise)) // standard_deviation() gives infinity below 5 samples
+  {
+    m_noise_tracker.push_back(noise);
+  }
+}
+
+bool stdrel_criterion::do_is_finished()
+{
+  // Not finished until the accumulated CUDA time exceeds "min-time".
+  if (m_total_cuda_time <= m_params.get_float64("min-time"))
+  {
+    return false;
+  }
+
+  // Noise is only recorded once it is finite (standard_deviation() reports infinity for fewer
+  // than 5 samples), so the tracker may still be empty here. In that case back() has no valid
+  // element to return; keep sampling until a noise estimate exists.
+  if (m_noise_tracker.size() == 0)
+  {
+    return false;
+  }
+
+  // Noise has dropped below threshold
+  const auto current_noise = m_noise_tracker.back();
+  if (current_noise < m_params.get_float64("max-noise"))
+  {
+    return true;
+  }
+
+  // Check if the noise (cuda rel stdev) has converged by inspecting a trailing window of
+  // recorded noise measurements. This ends benchmarks that are inherently noisy and would never
+  // reach the target threshold once their stdev stabilizes above it. Gather some iterations
+  // before checking noise, and limit how often we check this.
+  if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
+  {
+    // Use the current noise as the stdev reference.
+    const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
+      m_noise_tracker.cbegin(), m_noise_tracker.cend(), current_noise);
+    const auto noise_rel_stdev = noise_stdev / current_noise;
+
+    // A rel stdev below 5% over the tracked noise values is considered stable.
+    const auto noise_threshold = 0.05;
+    return noise_rel_stdev < noise_threshold;
+  }
+
+  return false;
+}
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/throw.cuh b/nvbench/detail/throw.cuh
index ffbe5bb9..e3bb9fd5 100644
--- a/nvbench/detail/throw.cuh
+++ b/nvbench/detail/throw.cuh
@@ -21,17 +21,15 @@
 #include <fmt/format.h>
 #include <stdexcept>
 
-#define NVBENCH_THROW(exception_type, format_str, ...)                         \
-  throw exception_type(fmt::format("{}:{}: {}",                                \
-                                   __FILE__,                                   \
-                                   __LINE__,                                   \
-                                   fmt::format(format_str, __VA_ARGS__)))
+#define NVBENCH_THROW(exception_type, format_str, ...)                                             \
+  throw exception_type(                                                                            \
+    fmt::format("{}:{}: {}", __FILE__, __LINE__, fmt::format(format_str, __VA_ARGS__)))
 
-#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)           \
-  do                                                                           \
-  {                                                                            \
-    if (condition)                                                             \
-    {                                                                          \
-      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                  \
-    }                                                                          \
+#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)                               \
+  do                                                                                               \
+  {                                                                                                \
+    if (condition)                                                                                 \
+    {                                                                                              \
+      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                                      \
+    }                                                                                              \
   } while (false)
diff --git a/nvbench/detail/transform_reduce.cuh b/nvbench/detail/transform_reduce.cuh
index 8bc5db68..56253587 100644
--- a/nvbench/detail/transform_reduce.cuh
+++ b/nvbench/detail/transform_reduce.cuh
@@ -27,10 +27,7 @@
 namespace nvbench::detail
 {
 
-template <typename InIterT,
-          typename InitValueT,
-          typename ReduceOp,
-          typename TransformOp>
+template <typename InIterT, typename InitValueT, typename ReduceOp, typename TransformOp>
 InitValueT transform_reduce(InIterT first,
                             InIterT last,
                             InitValueT init,
diff --git a/nvbench/detail/type_list_impl.cuh b/nvbench/detail/type_list_impl.cuh
index d2e498cd..e97aaaa1 100644
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -20,12 +20,10 @@ namespace tl::detail
 {
 
 template <typename... Ts>
-auto size(nvbench::type_list<Ts...>)
-  -> std::integral_constant<std::size_t, sizeof...(Ts)>;
+auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;
 
-template <std::size_t I, typename... Ts>
-auto get(nvbench::type_list<Ts...>)
-  -> std::tuple_element_t<I, std::tuple<Ts...>>;
+template <std::size_t Idx, typename... Ts>
+auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<Idx, std::tuple<Ts...>>;
 
 template <typename... Ts, typename... Us>
 auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
@@ -44,9 +42,8 @@ struct prepend_each<T, nvbench::type_list<>>
 template <typename T, typename TL, typename... TLTail>
 struct prepend_each<T, nvbench::type_list<TL, TLTail...>>
 {
-  using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
-  using next =
-    typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
+  using cur  = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
+  using next = typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
   using type = decltype(detail::concat(nvbench::type_list<cur>{}, next{}));
 };
 
@@ -71,23 +68,20 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<>, TLTail...>>
 template <typename T, typename... Ts>
 struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Ts...>>>
 {
-  using cur = nvbench::type_list<nvbench::type_list<T>>;
-  using next =
-    std::conditional_t<sizeof...(Ts) != 0,
-                       typename detail::cartesian_product<
-                         nvbench::type_list<nvbench::type_list<Ts...>>>::type,
-                       nvbench::type_list<>>;
+  using cur  = nvbench::type_list<nvbench::type_list<T>>;
+  using next = std::conditional_t<
+    sizeof...(Ts) != 0,
+    typename detail::cartesian_product<nvbench::type_list<nvbench::type_list<Ts...>>>::type,
+    nvbench::type_list<>>;
   using type = decltype(detail::concat(cur{}, next{}));
 };
 
 template <typename T, typename... Tail, typename TL, typename... TLTail>
-struct cartesian_product<
-  nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
+struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
 {
-  using tail_prod =
-    typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
-  using cur  = typename detail::prepend_each<T, tail_prod>::type;
-  using next = typename detail::cartesian_product<
+  using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
+  using cur       = typename detail::prepend_each<T, tail_prod>::type;
+  using next      = typename detail::cartesian_product<
     nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
   using type = decltype(detail::concat(cur{}, next{}));
 };
diff --git a/nvbench/device_info.cu b/nvbench/device_info.cu
index 02c6b973..3b26cdbc 100644
--- a/nvbench/device_info.cu
+++ b/nvbench/device_info.cu
@@ -45,6 +45,9 @@ device_info::device_info(int id)
     , m_nvml_device(nullptr)
 {
   NVBENCH_CUDA_CALL(cudaGetDeviceProperties(&m_prop, m_id));
+  // NVML's lifetime should extend for the entirety of the process, so keep the manager in a
+  // function-local static that is created on first use and destroyed at program exit.
+  [[maybe_unused]] static auto nvml_lifetime = nvbench::nvml::NVMLLifetimeManager();
 
 #ifdef NVBENCH_HAS_NVML
   // Retrieve the current device's pci_id as a null-terminated string.
@@ -65,17 +68,15 @@ void device_info::set_persistence_mode(bool state)
 #else  // NVBENCH_HAS_NVML
 try
 {
-  NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
-    m_nvml_device,
-    state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
+  NVBENCH_NVML_CALL(
+    nvmlDeviceSetPersistenceMode(m_nvml_device,
+                                 state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
 }
 catch (nvml::call_failed &e)
 {
   if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Persistence mode is only supported on Linux.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Persistence mode is only supported on Linux.");
   }
   else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
   {
@@ -104,30 +105,26 @@ try
       break;
 
     case clock_rate::base:
-      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
-        m_nvml_device,
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
+      NVBENCH_NVML_CALL(
+        nvmlDeviceSetGpuLockedClocks(m_nvml_device,
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
       break;
 
     case clock_rate::maximum: {
-      const auto max_mhz = static_cast<unsigned int>(
-        this->get_sm_default_clock_rate() / (1000 * 1000));
-      NVBENCH_NVML_CALL(
-        nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
+      const auto max_mhz =
+        static_cast<unsigned int>(this->get_sm_default_clock_rate() / (1000 * 1000));
+      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
       break;
     }
 
     default:
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized clock rate: {}",
-                    static_cast<int>(rate));
+      NVBENCH_THROW(std::runtime_error, "Unrecognized clock rate: {}", static_cast<int>(rate));
   }
 }
 catch (nvml::call_failed &e)
 {
-  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
-      this->get_sm_version() < 700)
+  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED && this->get_sm_version() < 700)
   {
     NVBENCH_THROW(std::runtime_error,
                   "GPU clock rates can only be modified for Volta and later. "
@@ -156,9 +153,7 @@ catch (nvml::call_failed &e)
 {
   if (!is_active())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "get_context is called for inactive device");
+    NVBENCH_THROW(std::runtime_error, "{}", "get_context is called for inactive device");
   }
 
   CUcontext cu_context;
diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index 296a2c2b..98184cf9 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -54,10 +54,7 @@ struct device_info
   [[nodiscard]] int get_id() const { return m_id; }
 
   /// @return The name of the device.
-  [[nodiscard]] std::string_view get_name() const
-  {
-    return std::string_view(m_prop.name);
-  }
+  [[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }
 
   [[nodiscard]] bool is_active() const
   {
@@ -83,7 +80,6 @@ struct device_info
   /// @note Requires root / admin privileges.
   void set_persistence_mode(bool state);
 
-
   /// Symbolic values for special clock rates
   enum class clock_rate
   {
@@ -101,10 +97,7 @@ struct device_info
   void lock_gpu_clocks(clock_rate rate);
 
   /// @return The SM version of the current device as (major*100) + (minor*10).
-  [[nodiscard]] int get_sm_version() const
-  {
-    return m_prop.major * 100 + m_prop.minor * 10;
-  }
+  [[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }
 
   /// @return The PTX version of the current device, e.g. sm_80 returns 800.
   [[nodiscard]] __forceinline__ int get_ptx_version() const
@@ -119,46 +112,25 @@ struct device_info
   }
 
   /// @return The number of physical streaming multiprocessors on this device.
-  [[nodiscard]] int get_number_of_sms() const
-  {
-    return m_prop.multiProcessorCount;
-  }
+  [[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }
 
   /// @return The maximum number of resident blocks per SM.
-  [[nodiscard]] int get_max_blocks_per_sm() const
-  {
-    return m_prop.maxBlocksPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }
 
   /// @return The maximum number of resident threads per SM.
-  [[nodiscard]] int get_max_threads_per_sm() const
-  {
-    return m_prop.maxThreadsPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }
 
   /// @return The maximum number of threads per block.
-  [[nodiscard]] int get_max_threads_per_block() const
-  {
-    return m_prop.maxThreadsPerBlock;
-  }
+  [[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }
 
   /// @return The number of registers per SM.
-  [[nodiscard]] int get_registers_per_sm() const
-  {
-    return m_prop.regsPerMultiprocessor;
-  }
+  [[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }
 
   /// @return The number of registers per block.
-  [[nodiscard]] int get_registers_per_block() const
-  {
-    return m_prop.regsPerBlock;
-  }
+  [[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }
 
   /// @return The total number of bytes available in global memory.
-  [[nodiscard]] std::size_t get_global_memory_size() const
-  {
-    return m_prop.totalGlobalMem;
-  }
+  [[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }
 
   struct memory_info
   {
@@ -176,16 +148,13 @@ struct device_info
   }
 
   /// @return The width of the global memory bus in bits.
-  [[nodiscard]] int get_global_memory_bus_width() const
-  {
-    return m_prop.memoryBusWidth;
-  }
+  [[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }
 
   //// @return The global memory bus bandwidth in bytes/sec.
   [[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
   { // 2 is for DDR, CHAR_BITS to convert bus_width to bytes.
     return 2 * this->get_global_memory_bus_peak_clock_rate() *
-           (this->get_global_memory_bus_width() / CHAR_BIT);
+           static_cast<std::size_t>(this->get_global_memory_bus_width() / CHAR_BIT);
   }
 
   /// @return The size of the L2 cache in bytes.
@@ -201,10 +170,7 @@ struct device_info
   }
 
   /// @return The available amount of shared memory in bytes per block.
-  [[nodiscard]] std::size_t get_shared_memory_per_block() const
-  {
-    return m_prop.sharedMemPerBlock;
-  }
+  [[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }
 
   /// @return True if ECC is enabled on this device.
   [[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
@@ -224,23 +190,11 @@ struct device_info
 #endif
 
   /// @return A cached copy of the device's cudaDeviceProp.
-  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
-  {
-    return m_prop;
-  }
+  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }
 
-  [[nodiscard]] bool operator<(const device_info &o) const
-  {
-    return m_id < o.m_id;
-  }
-  [[nodiscard]] bool operator==(const device_info &o) const
-  {
-    return m_id == o.m_id;
-  }
-  [[nodiscard]] bool operator!=(const device_info &o) const
-  {
-    return m_id != o.m_id;
-  }
+  [[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
+  [[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
+  [[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }
 
 private:
   int m_id;
@@ -267,11 +221,10 @@ try
 {
   nvbench::detail::device_scope _{dev_id};
   cudaFuncAttributes attr{};
-  NVBENCH_CUDA_CALL(
-    cudaFuncGetAttributes(&attr, ((const void*)nvbench::detail::noop_kernel_ptr) ));
+  NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&attr, ((const void *)nvbench::detail::noop_kernel_ptr)));
   return attr.ptxVersion * 10;
 }
-catch(...)
+catch (...)
 { // Fail gracefully when no appropriate PTX is found for this device.
   return -1;
 }
diff --git a/nvbench/device_manager.cu b/nvbench/device_manager.cu
index 136b20a3..a70a18c4 100644
--- a/nvbench/device_manager.cu
+++ b/nvbench/device_manager.cu
@@ -18,10 +18,11 @@
 
 #include <nvbench/device_manager.cuh>
 
+#include <cuda_runtime_api.h>
+
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
-
-#include <cuda_runtime_api.h>
+#include <nvbench/detail/throw.cuh>
 
 namespace nvbench
 {
@@ -44,4 +45,13 @@ device_manager::device_manager()
   }
 }
 
+const nvbench::device_info &device_manager::get_device(int id)
+{
+  if (id < 0) // Reject negatives here: the cast below would wrap them into huge indices.
+  {
+    NVBENCH_THROW(std::runtime_error, "Negative index: {}.", id);
+  }
+  return m_devices.at(static_cast<std::size_t>(id)); // at() is relied on for the upper bound.
+}
+
 } // namespace nvbench
diff --git a/nvbench/device_manager.cuh b/nvbench/device_manager.cuh
index 94907755..36082b81 100644
--- a/nvbench/device_manager.cuh
+++ b/nvbench/device_manager.cuh
@@ -40,10 +40,7 @@ struct device_manager
   /**
    * @return The total number of detected CUDA devices.
    */
-  [[nodiscard]] int get_number_of_devices() const
-  {
-    return static_cast<int>(m_devices.size());
-  }
+  [[nodiscard]] int get_number_of_devices() const { return static_cast<int>(m_devices.size()); }
 
   /**
    * @return The number of devices actually used by all benchmarks.
@@ -57,39 +54,27 @@ struct device_manager
   /**
    * @return The device_info object corresponding to `id`.
    */
-  [[nodiscard]] const nvbench::device_info &get_device(int id)
-  {
-    return m_devices.at(id);
-  }
+  [[nodiscard]] const nvbench::device_info &get_device(int id);
 
   /**
    * @return A vector containing device_info objects for all detected CUDA
    * devices.
    */
-  [[nodiscard]] const device_info_vector &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_devices() const { return m_devices; }
 
   /**
    * @return A vector containing device_info objects for devices that are
    * actively used by all benchmarks.
    * @note This is only valid after nvbench::option_parser::parse executes.
    */
-  [[nodiscard]] const device_info_vector &get_used_devices() const
-  {
-    return m_used_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_used_devices() const { return m_used_devices; }
 
 private:
   device_manager();
 
   friend struct option_parser;
 
-  void set_used_devices(device_info_vector devices)
-  {
-    m_used_devices = std::move(devices);
-  }
+  void set_used_devices(device_info_vector devices) { m_used_devices = std::move(devices); }
 
   device_info_vector m_devices;
   device_info_vector m_used_devices;
diff --git a/nvbench/enum_type_list.cuh b/nvbench/enum_type_list.cuh
index 6ec529bc..614057f1 100644
--- a/nvbench/enum_type_list.cuh
+++ b/nvbench/enum_type_list.cuh
@@ -64,10 +64,7 @@ struct type_strings<nvbench::enum_type<Value, T>>
     return std::to_string(Value);
   }
 
-  static std::string description()
-  {
-    return nvbench::demangle<nvbench::enum_type<Value, T>>();
-  }
+  static std::string description() { return nvbench::demangle<nvbench::enum_type<Value, T>>(); }
 };
 
 } // namespace nvbench
@@ -86,15 +83,13 @@ struct type_strings<nvbench::enum_type<Value, T>>
  * \relatesalso enum_type_list
  * \relatesalso nvbench::enum_type_list
  */
-#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T,                                   \
-                                          input_generator,                     \
-                                          description_generator)               \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <T Value>                                                           \
-  struct type_strings<enum_type<Value, T>>                                     \
-  {                                                                            \
-    static std::string input_string() { return input_generator(Value); }       \
-    static std::string description() { return description_generator(Value); }  \
-  };                                                                           \
+#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, input_generator, description_generator)               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <T Value>                                                                               \
+  struct type_strings<enum_type<Value, T>>                                                         \
+  {                                                                                                \
+    static std::string input_string() { return input_generator(Value); }                           \
+    static std::string description() { return description_generator(Value); }                      \
+  };                                                                                               \
   }
diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh
index b49ed36d..c935e4cb 100644
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -31,16 +31,16 @@ enum class exec_flag
   none = 0x0,
 
   // Modifiers:
-  timer    = 0x01, // KernelLauncher uses manual timing
-  no_block = 0x02, // Disables use of `blocking_kernel`.
-  sync     = 0x04, // KernelLauncher has indicated that it will sync
-  run_once = 0x08, // Only run the benchmark once (for profiling).
+  timer         = 0x01, // KernelLauncher uses manual timing
+  no_block      = 0x02, // Disables use of `blocking_kernel`.
+  sync          = 0x04, // KernelLauncher has indicated that it will sync
+  run_once      = 0x08, // Only run the benchmark once (for profiling).
   modifier_mask = timer | no_block | sync | run_once,
 
   // Measurement types:
-  cold = 0x0100, // measure_hot
-  hot  = 0x0200, // measure_cold
-  measure_mask  = cold | hot
+  cold         = 0x0100, // measure_cold
+  hot          = 0x0200, // measure_hot
+  measure_mask = cold | hot
 };
 
 } // namespace nvbench::detail
@@ -120,7 +120,9 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer;
 
 /// Modifier used to indicate that the KernelGenerator will perform CUDA
 /// synchronizations. Without this flag such benchmarks will deadlock.
-constexpr inline auto sync = nvbench::exec_tag::impl::no_block |
-                             nvbench::exec_tag::impl::sync;
+constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;
+
+/// Modifier used to indicate that batched measurements should be disabled (alias of impl::cold).
+constexpr inline auto no_batch = nvbench::exec_tag::impl::cold;
 
 } // namespace nvbench::exec_tag
diff --git a/nvbench/flags.cuh b/nvbench/flags.cuh
index 30ba84eb..cefefa3c 100644
--- a/nvbench/flags.cuh
+++ b/nvbench/flags.cuh
@@ -20,24 +20,24 @@
 
 #include <type_traits>
 
-#define NVBENCH_DECLARE_FLAGS(T)                                               \
-  constexpr inline T operator|(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator&(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator^(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator~(T v1)                                           \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(~static_cast<UT>(v1));                               \
+#define NVBENCH_DECLARE_FLAGS(T)                                                                   \
+  constexpr inline T operator|(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator&(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator^(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator~(T v1)                                                               \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(~static_cast<UT>(v1));                                                   \
   }
diff --git a/nvbench/float64_axis.cuh b/nvbench/float64_axis.cuh
index 0d606512..ef7b089d 100644
--- a/nvbench/float64_axis.cuh
+++ b/nvbench/float64_axis.cuh
@@ -36,20 +36,11 @@ struct float64_axis final : public axis_base
 
   ~float64_axis() final;
 
-  void set_inputs(std::vector<nvbench::float64_t> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<nvbench::float64_t> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<float64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<float64_axis>(*this); }
   std::size_t do_get_size() const final { return m_values.size(); }
   std::string do_get_input_string(std::size_t i) const final;
   std::string do_get_description(std::size_t i) const final;
diff --git a/nvbench/git_revision.cuh b/nvbench/git_revision.cuh
index 2b29e920..50fc9da7 100644
--- a/nvbench/git_revision.cuh
+++ b/nvbench/git_revision.cuh
@@ -1,20 +1,20 @@
 /*
-*  Copyright 2021 NVIDIA Corporation
-*
-*  Licensed under the Apache License, Version 2.0 with the LLVM exception
-*  (the "License"); you may not use this file except in compliance with
-*  the License.
-*
-*  You may obtain a copy of the License at
-*
-*      http://llvm.org/foundation/relicensing/LICENSE.txt
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*/
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
 #pragma once
 
@@ -25,7 +25,5 @@
 #define NVBENCH_GIT_SHA1 NVBench_GIT_SHA1
 #define NVBENCH_GIT_VERSION NVBench_GIT_VERSION
 #ifdef NVBench_GIT_IS_DIRTY
-#  define NVBENCH_GIT_IS_DIRTY
+#define NVBENCH_GIT_IS_DIRTY
 #endif
-
-
diff --git a/nvbench/int64_axis.cuh b/nvbench/int64_axis.cuh
index a6cec2e3..adc95d11 100644
--- a/nvbench/int64_axis.cuh
+++ b/nvbench/int64_axis.cuh
@@ -58,34 +58,27 @@ struct int64_axis final : public axis_base
     return static_cast<bool>(m_flags & int64_axis_flags::power_of_two);
   }
 
-  void set_inputs(std::vector<int64_t> inputs,
-                  int64_axis_flags flags = int64_axis_flags::none);
+  void set_inputs(std::vector<int64_t> inputs, int64_axis_flags flags = int64_axis_flags::none);
 
-  [[nodiscard]] const std::vector<int64_t> &get_inputs() const
-  {
-    return m_inputs;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_inputs() const { return m_inputs; };
 
   [[nodiscard]] int64_t get_value(std::size_t i) const { return m_values[i]; };
 
-  [[nodiscard]] const std::vector<int64_t> &get_values() const
-  {
-    return m_values;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_values() const { return m_values; };
 
   int64_axis_flags get_flags() const { return m_flags; }
 
   // Helper functions for pow2 conversions:
   static nvbench::int64_t compute_pow2(nvbench::int64_t exponent)
   {
-    return 1ll << exponent;
+    return nvbench::int64_t{1} << exponent;
   }
 
   // UB if value < 0.
   static nvbench::int64_t compute_log2(nvbench::int64_t value)
   {
     // TODO use <bit> functions in C++20?
-    nvbench::uint64_t bits    = static_cast<nvbench::int64_t>(value);
+    nvbench::uint64_t bits    = static_cast<nvbench::uint64_t>(value);
     nvbench::int64_t exponent = 0;
     while ((bits >>= 1) != 0ull)
     {
@@ -95,10 +88,7 @@ struct int64_axis final : public axis_base
   };
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<int64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<int64_axis>(*this); }
   std::size_t do_get_size() const final { return m_inputs.size(); }
   std::string do_get_input_string(std::size_t) const final;
   std::string do_get_description(std::size_t) const final;
diff --git a/nvbench/int64_axis.cxx b/nvbench/int64_axis.cxx
index 24ff913d..599c388f 100644
--- a/nvbench/int64_axis.cxx
+++ b/nvbench/int64_axis.cxx
@@ -66,9 +66,8 @@ std::string int64_axis::do_get_input_string(std::size_t i) const
 
 std::string int64_axis::do_get_description(std::size_t i) const
 {
-  return this->is_power_of_two()
-           ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
-           : std::string{};
+  return this->is_power_of_two() ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
+                                 : std::string{};
 }
 
 std::string_view int64_axis::do_get_flags_as_string() const
diff --git a/nvbench/internal/markdown_table.cuh b/nvbench/internal/markdown_table.cuh
index bb721c65..518f57b9 100644
--- a/nvbench/internal/markdown_table.cuh
+++ b/nvbench/internal/markdown_table.cuh
@@ -85,8 +85,7 @@ private:
                             " {:^{}} ",
                             col.header,
                             col.max_width);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
     }
     return fmt::format_to(iter, "\n");
   }
@@ -102,8 +101,7 @@ private:
                             "{:-^{}}",
                             "",
                             col.max_width + 2);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
     }
     return fmt::format_to(iter, "\n");
   }
@@ -116,8 +114,7 @@ private:
 
     for (std::size_t row = 0; row < m_num_rows; ++row)
     {
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
       for (const column &col : m_columns)
       {
         iter = fmt::format_to(iter,
@@ -125,8 +122,7 @@ private:
                               " {:>{}} ",
                               col.rows[row],
                               col.max_width);
-        iter =
-          fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+        iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
       } // cols
 
       iter = fmt::format_to(iter, "\n");
diff --git a/nvbench/internal/nvml.cuh b/nvbench/internal/nvml.cuh
index 497f31a1..05c6764a 100644
--- a/nvbench/internal/nvml.cuh
+++ b/nvbench/internal/nvml.cuh
@@ -32,6 +32,16 @@
 namespace nvbench::nvml
 {
 
+/// RAII guard for NVML: the constructor initializes the library, the destructor shuts it down.
+/// An instance must be constructed and kept alive for as long as NVML is being used.
+struct NVMLLifetimeManager
+{
+  NVMLLifetimeManager();
+  ~NVMLLifetimeManager();
+private:
+  bool m_inited{false}; // True only after initialization succeeded; gates the shutdown call.
+};
+
 /// Base class for NVML-specific exceptions
 struct error : std::runtime_error
 {
@@ -74,10 +84,7 @@ struct call_failed : error
 
   [[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }
 
-  [[nodiscard]] const std::string &get_error_string() const
-  {
-    return m_error_string;
-  }
+  [[nodiscard]] const std::string &get_error_string() const { return m_error_string; }
 
 private:
   nvmlReturn_t m_error_code;
@@ -90,30 +97,26 @@ private:
 
 #ifdef NVBENCH_HAS_NVML
 
-#define NVBENCH_NVML_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__,                               \
-                                       __LINE__,                               \
-                                       #call,                                  \
-                                       _rr,                                    \
-                                       nvmlErrorString(_rr));                  \
-    }                                                                          \
+#define NVBENCH_NVML_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, nvmlErrorString(_rr));      \
+    }                                                                                              \
   } while (false)
 
 // Same as above, but used for nvmlInit(), where a failure means that
 // nvmlErrorString is not available.
-#define NVBENCH_NVML_CALL_NO_API(call)                                         \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");    \
-    }                                                                          \
+#define NVBENCH_NVML_CALL_NO_API(call)                                                             \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");                        \
+    }                                                                                              \
   } while (false)
 
 #endif // NVBENCH_HAS_NVML
diff --git a/nvbench/internal/nvml.cxx b/nvbench/internal/nvml.cxx
index 4f750bce..025515d2 100644
--- a/nvbench/internal/nvml.cxx
+++ b/nvbench/internal/nvml.cxx
@@ -18,54 +18,38 @@
 
 #include <nvbench/internal/nvml.cuh>
 
-#include <nvbench/config.cuh>
-
-#include <fmt/format.h>
-
-#include <nvml.h>
-
-#include <stdexcept>
-
-namespace
+namespace nvbench::nvml
 {
+NVMLLifetimeManager::NVMLLifetimeManager()
+{
+#ifdef NVBENCH_HAS_NVML
+  try
+  {
+    NVBENCH_NVML_CALL_NO_API(nvmlInit()); // Throws on failure; m_inited then stays false.
+    m_inited = true;
+  }
+  catch (const std::exception &e) // Best effort: report the failure and continue without NVML.
+  {
+    fmt::print("NVML initialization failed:\n {}", e.what());
+  }
+#endif
+}
 
-// RAII struct that initializes and shuts down NVML
-struct NVMLLifetimeManager
+NVMLLifetimeManager::~NVMLLifetimeManager()
 {
-  NVMLLifetimeManager()
+#ifdef NVBENCH_HAS_NVML
+  if (m_inited)
   {
     try
     {
-      NVBENCH_NVML_CALL_NO_API(nvmlInit());
-      m_inited = true;
+      NVBENCH_NVML_CALL_NO_API(nvmlShutdown());
     }
     catch (std::exception &e)
     {
-      fmt::print("NVML initialization failed:\n {}", e.what());
-    }
-  }
-
-  ~NVMLLifetimeManager()
-  {
-    if (m_inited)
-    {
-      try
-      {
-        NVBENCH_NVML_CALL_NO_API(nvmlShutdown());
-      }
-      catch (std::exception &e)
-      {
-        fmt::print("NVML shutdown failed:\n {}", e.what());
-      }
+      fmt::print("NVML shutdown failed:\n {}", e.what());
     }
   }
+#endif
+}
 
-private:
-  bool m_inited{false};
-};
-
-// NVML's lifetime should extend for the entirety of the process, so store in a
-// global.
-auto nvml_lifetime = NVMLLifetimeManager{};
-
-} // namespace
+} // namespace nvbench::nvml
diff --git a/nvbench/internal/table_builder.cuh b/nvbench/internal/table_builder.cuh
index 81fca0a0..ae029b41 100644
--- a/nvbench/internal/table_builder.cuh
+++ b/nvbench/internal/table_builder.cuh
@@ -47,18 +47,14 @@ struct table_builder
                 const std::string &header,
                 std::string value)
   {
-    auto iter = std::find_if(m_columns.begin(),
-                             m_columns.end(),
-                             [&column_key](const column &col) {
-                               return col.key == column_key;
-                             });
+    auto iter = std::find_if(m_columns.begin(), m_columns.end(), [&column_key](const column &col) {
+      return col.key == column_key;
+    });
 
     auto &col = iter == m_columns.end()
-                ? m_columns.emplace_back(column{column_key,
-                                                header,
-                                                std::vector<std::string>{},
-                                                header.size()})
-                : *iter;
+                  ? m_columns.emplace_back(
+                      column{column_key, header, std::vector<std::string>{}, header.size()})
+                  : *iter;
 
     col.max_width = std::max(col.max_width, value.size());
     if (col.rows.size() <= row)
@@ -76,11 +72,9 @@ struct table_builder
       std::size_t{},
       [](const auto &a, const auto &b) { return a > b ? a : b; },
       [](const column &col) { return col.rows.size(); });
-    std::for_each(m_columns.begin(),
-                  m_columns.end(),
-                  [num_rows = m_num_rows](column &col) {
-                    col.rows.resize(num_rows);
-                  });
+    std::for_each(m_columns.begin(), m_columns.end(), [num_rows = m_num_rows](column &col) {
+      col.rows.resize(num_rows);
+    });
   }
 };
 
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index 7c99f2af..f7e337ae 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -43,10 +43,14 @@
 #include <utility>
 #include <vector>
 
-#ifdef __GNUC__
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
 #include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
 #else
-#include <filesystem>
+static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
 #endif
 
 #if NVBENCH_CPP_DIALECT >= 2020
@@ -126,11 +130,10 @@ std::string json_printer::version_t::get_string() const
   return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
 }
 
-void json_printer::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void json_printer::do_process_bulk_data_float64(state &state,
+                                                const std::string &tag,
+                                                const std::string &hint,
+                                                const std::vector<nvbench::float64_t> &data)
 {
   printer_base::do_process_bulk_data_float64(state, tag, hint, data);
 
@@ -141,12 +144,6 @@ void json_printer::do_process_bulk_data_float64(
 
   if (hint == "sample_times")
   {
-#ifdef __GNUC__
-    namespace fs = std::experimental::filesystem;
-#else
-    namespace fs = std::filesystem;
-#endif
-
     nvbench::cpu_timer timer;
     timer.start();
 
@@ -157,16 +154,12 @@ void json_printer::do_process_bulk_data_float64(
       {
         if (!fs::create_directory(result_path))
         {
-          NVBENCH_THROW(std::runtime_error,
-                        "{}",
-                        "Failed to create result directory '{}'.");
+          NVBENCH_THROW(std::runtime_error, "Failed to create result directory '{}'.", result_path.string());
         }
       }
       else if (!fs::is_directory(result_path))
       {
-        NVBENCH_THROW(std::runtime_error,
-                      "{}",
-                      "'{}' exists and is not a directory.");
+        NVBENCH_THROW(std::runtime_error, "'{}' exists and is not a directory.", result_path.string());
       }
 
       const auto file_id = m_num_jsonbin_files++;
@@ -197,16 +190,12 @@ void json_printer::do_process_bulk_data_float64(
     }
     catch (std::exception &e)
     {
-      if (auto printer_opt_ref = state.get_benchmark().get_printer();
-          printer_opt_ref.has_value())
+      if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
       {
         auto &printer = printer_opt_ref.value().get();
-        printer.log(nvbench::log_level::warn,
-                    fmt::format("Error writing {} ({}) to {}: {}",
-                                tag,
-                                hint,
-                                result_path.string(),
-                                e.what()));
+        printer.log(
+          nvbench::log_level::warn,
+          fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
       }
     } // end catch
 
@@ -221,18 +210,45 @@ void json_printer::do_process_bulk_data_float64(
     summ.set_string("hide", "Not needed in table.");
 
     timer.stop();
-    if (auto printer_opt_ref = state.get_benchmark().get_printer();
-        printer_opt_ref.has_value())
+    if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
     {
       auto &printer = printer_opt_ref.value().get();
-      printer.log(nvbench::log_level::info,
-                  fmt::format("Wrote '{}' in {:>6.3f}ms",
-                              result_path.string(),
-                              timer.get_duration() * 1000));
+      printer.log(
+        nvbench::log_level::info,
+        fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
     }
   } // end hint == sample_times
 }
 
+// Appends one JSON object per device reported by the device_manager to root["devices"].
+static void add_devices_section(nlohmann::ordered_json &root)
+{
+  auto &device_list = root["devices"];
+  for (const auto &info : nvbench::device_manager::get().get_devices())
+  {
+    auto &entry                                = device_list.emplace_back();
+    entry["id"]                                = info.get_id();
+    entry["name"]                              = info.get_name();
+    entry["sm_version"]                        = info.get_sm_version();
+    entry["ptx_version"]                       = info.get_ptx_version();
+    entry["sm_default_clock_rate"]             = info.get_sm_default_clock_rate();
+    entry["number_of_sms"]                     = info.get_number_of_sms();
+    entry["max_blocks_per_sm"]                 = info.get_max_blocks_per_sm();
+    entry["max_threads_per_sm"]                = info.get_max_threads_per_sm();
+    entry["max_threads_per_block"]             = info.get_max_threads_per_block();
+    entry["registers_per_sm"]                  = info.get_registers_per_sm();
+    entry["registers_per_block"]               = info.get_registers_per_block();
+    entry["global_memory_size"]                = info.get_global_memory_size();
+    entry["global_memory_bus_peak_clock_rate"] = info.get_global_memory_bus_peak_clock_rate();
+    entry["global_memory_bus_width"]           = info.get_global_memory_bus_width();
+    entry["global_memory_bus_bandwidth"]       = info.get_global_memory_bus_bandwidth();
+    entry["l2_cache_size"]                     = info.get_l2_cache_size();
+    entry["shared_memory_per_sm"]              = info.get_shared_memory_per_sm();
+    entry["shared_memory_per_block"]           = info.get_shared_memory_per_block();
+    entry["ecc_state"]                         = info.get_ecc_state();
+  }
+}
+
 void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
 {
   nlohmann::ordered_json root;
@@ -285,36 +301,7 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
     }   // "version"
   }     // "meta"
 
-  {
-    auto &devices = root["devices"];
-    for (const auto &dev_info : nvbench::device_manager::get().get_devices())
-    {
-      auto &device                    = devices.emplace_back();
-      device["id"]                    = dev_info.get_id();
-      device["name"]                  = dev_info.get_name();
-      device["sm_version"]            = dev_info.get_sm_version();
-      device["ptx_version"]           = dev_info.get_ptx_version();
-      device["sm_default_clock_rate"] = dev_info.get_sm_default_clock_rate();
-      device["number_of_sms"]         = dev_info.get_number_of_sms();
-      device["max_blocks_per_sm"]     = dev_info.get_max_blocks_per_sm();
-      device["max_threads_per_sm"]    = dev_info.get_max_threads_per_sm();
-      device["max_threads_per_block"] = dev_info.get_max_threads_per_block();
-      device["registers_per_sm"]      = dev_info.get_registers_per_sm();
-      device["registers_per_block"]   = dev_info.get_registers_per_block();
-      device["global_memory_size"]    = dev_info.get_global_memory_size();
-      device["global_memory_bus_peak_clock_rate"] =
-        dev_info.get_global_memory_bus_peak_clock_rate();
-      device["global_memory_bus_width"] =
-        dev_info.get_global_memory_bus_width();
-      device["global_memory_bus_bandwidth"] =
-        dev_info.get_global_memory_bus_bandwidth();
-      device["l2_cache_size"]        = dev_info.get_l2_cache_size();
-      device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
-      device["shared_memory_per_block"] =
-        dev_info.get_shared_memory_per_block();
-      device["ecc_state"] = dev_info.get_ecc_state();
-    }
-  } // "devices"
+  add_devices_section(root);
 
   {
     auto &benchmarks = root["benchmarks"];
@@ -358,23 +345,19 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
           switch (axis_ptr->get_type())
           {
             case nvbench::axis_type::type:
-              value["is_active"] =
-                static_cast<type_axis &>(*axis_ptr).get_is_active(i);
+              value["is_active"] = static_cast<type_axis &>(*axis_ptr).get_is_active(i);
               break;
 
             case nvbench::axis_type::int64:
-              value["value"] =
-                static_cast<int64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
               break;
 
             case nvbench::axis_type::float64:
-              value["value"] =
-                static_cast<float64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
               break;
 
             case nvbench::axis_type::string:
-              value["value"] =
-                static_cast<string_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
               break;
             default:
               break;
@@ -454,4 +437,73 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
   m_ostream << root.dump(2) << "\n";
 }
 
+// Writes the benchmark list (name, index, and each axis with its values) to m_ostream as
+// JSON. Nothing is written when `benches` is empty.
+void json_printer::do_print_benchmark_list(const benchmark_vector &benches)
+{
+  if (benches.empty())
+  {
+    return;
+  }
+
+  nlohmann::ordered_json root;
+  auto &benchmarks = root["benchmarks"];
+
+  for (const auto &bench_ptr : benches)
+  {
+    const auto bench_index = benchmarks.size();
+    auto &bench            = benchmarks.emplace_back();
+
+    bench["name"]  = bench_ptr->get_name();
+    bench["index"] = bench_index;
+
+    // We have to ensure that the axes are represented as an array, not a
+    // null object, when there are no axes.
+    auto &axes = bench["axes"] = nlohmann::json::array();
+
+    for (const auto &axis_ptr : bench_ptr->get_axes().get_axes())
+    {
+      auto &axis = axes.emplace_back();
+
+      axis["name"]  = axis_ptr->get_name();
+      axis["type"]  = axis_ptr->get_type_as_string();
+      axis["flags"] = axis_ptr->get_flags_as_string();
+
+      // Same guarantee for "values": an axis without values must still emit an array.
+      auto &values         = axis["values"] = nlohmann::json::array();
+      const auto axis_size = axis_ptr->get_size();
+      for (std::size_t i = 0; i < axis_size; ++i)
+      {
+        auto &value           = values.emplace_back();
+        value["input_string"] = axis_ptr->get_input_string(i);
+        value["description"]  = axis_ptr->get_description(i);
+
+        switch (axis_ptr->get_type())
+        {
+          case nvbench::axis_type::int64:
+            value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
+            break;
+          case nvbench::axis_type::float64:
+            value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
+            break;
+          case nvbench::axis_type::string:
+            value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
+            break;
+          default:
+            break;
+        } // end switch (axis type)
+      }   // end foreach axis value
+    }
+  } // end foreach bench
+
+  m_ostream << root.dump(2) << "\n";
+}
+
+void json_printer::print_devices_json()
+{ // Emits a standalone JSON document that holds only the "devices" section.
+  nlohmann::ordered_json devices_doc;
+  add_devices_section(devices_doc);
+  m_ostream << devices_doc.dump(2) << "\n";
+}
+
 } // namespace nvbench
diff --git a/nvbench/json_printer.cuh b/nvbench/json_printer.cuh
index a58448f3..8457687f 100644
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -38,9 +38,7 @@ struct json_printer : nvbench::printer_base
 {
   using printer_base::printer_base;
 
-  json_printer(std::ostream &stream,
-               std::string stream_name,
-               bool enable_binary_output)
+  json_printer(std::ostream &stream, std::string stream_name, bool enable_binary_output)
       : printer_base(stream, std::move(stream_name))
       , m_enable_binary_output{enable_binary_output}
   {}
@@ -59,24 +57,20 @@ struct json_printer : nvbench::printer_base
 
   [[nodiscard]] static version_t get_json_file_version();
 
-  [[nodiscard]] bool get_enable_binary_output() const
-  {
-    return m_enable_binary_output;
-  }
+  [[nodiscard]] bool get_enable_binary_output() const { return m_enable_binary_output; }
   void set_enable_binary_output(bool b) { m_enable_binary_output = b; }
 
+  void print_devices_json();
+
 protected:
   // Virtual API from printer_base:
-  void do_log_argv(const std::vector<std::string>& argv) override
-  {
-    m_argv = argv;
-  }
-  void do_process_bulk_data_float64(
-    nvbench::state &state,
-    const std::string &tag,
-    const std::string &hint,
-    const std::vector<nvbench::float64_t> &data) override;
+  void do_log_argv(const std::vector<std::string> &argv) override { m_argv = argv; }
+  void do_process_bulk_data_float64(nvbench::state &state,
+                                    const std::string &tag,
+                                    const std::string &hint,
+                                    const std::vector<nvbench::float64_t> &data) override;
   void do_print_benchmark_results(const benchmark_vector &benches) override;
+  void do_print_benchmark_list(const benchmark_vector &) override;
 
   bool m_enable_binary_output{false};
   std::size_t m_num_jsonbin_files{};
diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
index 4b973f32..c45f1fda 100644
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -42,20 +42,17 @@ struct launch
   {}
 
   // move-only
-  launch(const launch &) = delete;
-  launch(launch &&)      = default;
+  launch(const launch &)            = delete;
+  launch(launch &&)                 = default;
   launch &operator=(const launch &) = delete;
-  launch &operator=(launch &&) = default;
+  launch &operator=(launch &&)      = delete;
 
   /**
    * @return a CUDA stream that all kernels and other stream-ordered CUDA work
    * must use. This stream can be changed by the `KernelGenerator` using the
    * `nvbench::state::set_cuda_stream` method.
    */
-  __forceinline__ const nvbench::cuda_stream &get_stream() const
-  {
-    return m_stream;
-  };
+  __forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; }
 
 private:
   // The stream is owned by the `nvbench::state` associated with this launch.
diff --git a/nvbench/main.cuh b/nvbench/main.cuh
index 4c1588cd..cd809ba4 100644
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -25,64 +25,229 @@
 #include <nvbench/option_parser.cuh>
 #include <nvbench/printer_base.cuh>
 
+#include <cstdlib>
 #include <iostream>
 
-#define NVBENCH_MAIN                                                           \
-  int main(int argc, char const *const *argv)                                  \
-  try                                                                          \
-  {                                                                            \
-    NVBENCH_MAIN_BODY(argc, argv);                                             \
-    NVBENCH_CUDA_CALL(cudaDeviceReset());                                      \
-    return 0;                                                                  \
-  }                                                                            \
-  catch (std::exception & e)                                                   \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";    \
-    return 1;                                                                  \
-  }                                                                            \
-  catch (...)                                                                  \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an unknown error.\n";                  \
-    return 1;                                                                  \
+// Advanced users can rebuild NVBench's `main` function using the macros in this file, or replace
+// them with customized implementations.
+
+// Customization point, called before NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) []() {}()
+#endif
+
+// Customization point, called after NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update argc/argv if needed.
+// argc/argv have the usual command line argument types. The ARGS version of this
+// macro is a bit more convenient.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update args if needed.
+// Args is a vector of strings, each element is an argument.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_PRE
+#define NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args) []() {}()
+#endif
+
+// Customization point, called after NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_POST
+#define NVBENCH_MAIN_PARSE_CUSTOM_POST(parser) []() {}()
+#endif
+
+// Customization point, called before NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() []() {}()
+#endif
+
+// Customization point, called after NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_POST() []() {}()
+#endif
+
+// Customization point, used to catch additional exceptions.
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#endif
+
+/************************************ Default implementation **************************************/
+
+#ifndef NVBENCH_MAIN
+#define NVBENCH_MAIN                                                                               \
+  int main(int argc, char **argv)                                                                  \
+  try                                                                                              \
+  {                                                                                                \
+    NVBENCH_MAIN_BODY(argc, argv);                                                                 \
+    return 0;                                                                                      \
+  }                                                                                                \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                             \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS
+#endif
+
+#ifndef NVBENCH_MAIN_BODY
+#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
+  NVBENCH_MAIN_INITIALIZE(argc, argv);                                                             \
+  {                                                                                                \
+    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_PREAMBLE(parser);                                                           \
+    NVBENCH_MAIN_RUN_BENCHMARKS(parser);                                                           \
+    NVBENCH_MAIN_PRINT_EPILOGUE(parser);                                                           \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_RESULTS(parser);                                                            \
+  } /* Tear down parser before finalization */                                                     \
+  NVBENCH_MAIN_FINALIZE();                                                                         \
+  return 0;
+#endif
+
+#ifndef NVBENCH_MAIN_INITIALIZE
+#define NVBENCH_MAIN_INITIALIZE(argc, argv)                                                        \
+  { /* Open a scope to ensure that the inner initialize/finalize hooks clean up in order. */       \
+    NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv);                                                \
+    nvbench::detail::main_initialize(argc, argv);                                                  \
+    { /* Open a scope to ensure that the inner initialize/finalize hooks clean up in order. */     \
+      NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)
+#endif
+
+#ifndef NVBENCH_MAIN_PARSE
+#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
+  NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv);                                               \
+  std::vector<std::string> args = nvbench::detail::main_convert_args(argc, argv);                  \
+  NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args);                                                          \
+  nvbench::option_parser parser;                                                                   \
+  NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args);                                                     \
+  parser.parse(args);                                                                              \
+  NVBENCH_MAIN_PARSE_CUSTOM_POST(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_PREAMBLE
+#define NVBENCH_MAIN_PRINT_PREAMBLE(parser) nvbench::detail::main_print_preamble(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_RUN_BENCHMARKS
+#define NVBENCH_MAIN_RUN_BENCHMARKS(parser) nvbench::detail::main_run_benchmarks(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_EPILOGUE
+#define NVBENCH_MAIN_PRINT_EPILOGUE(parser) nvbench::detail::main_print_epilogue(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_RESULTS
+#define NVBENCH_MAIN_PRINT_RESULTS(parser) nvbench::detail::main_print_results(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_FINALIZE
+#define NVBENCH_MAIN_FINALIZE()                                                                    \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_PRE();                                                              \
+  } /* Close a scope to ensure that the inner initialize/finalize hooks clean up in order. */      \
+  nvbench::detail::main_finalize();                                                                \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_POST();                                                             \
+  } /* Close a scope to ensure that the inner initialize/finalize hooks clean up in order. */      \
+  []() {}()
+#endif
+
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS                                                              \
+  catch (std::exception & e)                                                                       \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";                        \
+    return 1;                                                                                      \
+  }                                                                                                \
+  catch (...)                                                                                      \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an unknown error.\n";                                      \
+    return 1;                                                                                      \
   }
+#endif
 
-#ifdef NVBENCH_HAS_CUPTI
-#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0))
+namespace nvbench::detail
+{
+
+inline void set_env(const char *name, const char *value)
+{
+#ifdef _MSC_VER
+  _putenv_s(name, value);
 #else
-// clang-format off
-#define NVBENCH_INITIALIZE_DRIVER_API do {} while (false)
-// clang-format on
-#endif
-
-#define NVBENCH_MAIN_PARSE(argc, argv)                                         \
-  nvbench::option_parser parser;                                               \
-  parser.parse(argc, argv)
-
-#define NVBENCH_MAIN_BODY(argc, argv)                                          \
-  do                                                                           \
-  {                                                                            \
-    NVBENCH_INITIALIZE_DRIVER_API;                                             \
-    NVBENCH_MAIN_PARSE(argc, argv);                                            \
-    auto &printer = parser.get_printer();                                      \
-                                                                               \
-    printer.print_device_info();                                               \
-    printer.print_log_preamble();                                              \
-    auto &benchmarks = parser.get_benchmarks();                                \
-                                                                               \
-    std::size_t total_states = 0;                                              \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      total_states += bench_ptr->get_config_count();                           \
-    }                                                                          \
-    printer.set_total_state_count(total_states);                               \
-                                                                               \
-    printer.set_completed_state_count(0);                                      \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      bench_ptr->set_printer(printer);                                         \
-      bench_ptr->run();                                                        \
-      bench_ptr->clear_printer();                                              \
-    }                                                                          \
-    printer.print_log_epilogue();                                              \
-    printer.print_benchmark_results(benchmarks);                               \
-  } while (false)
+  setenv(name, value, 1);
+#endif
+}
+
+inline void main_initialize(int, char **) // both parameters are unnamed and unused here
+{
+  // Force CUDA_MODULE_LOADING=EAGER, replacing any existing value. See NVIDIA/NVBench#136.
+  set_env("CUDA_MODULE_LOADING", "EAGER");
+
+  // Initialize the CUDA driver API; only compiled in when NVBENCH_HAS_CUPTI is defined:
+#ifdef NVBENCH_HAS_CUPTI
+  NVBENCH_DRIVER_API_CALL(cuInit(0));
+#endif
+
+  // Initialize the benchmarks *after* setting up the CUDA environment:
+  nvbench::benchmark_manager::get().initialize();
+}
+
+// Copies argv[0..argc) into owning strings; a non-positive argc yields an empty vector.
+inline std::vector<std::string> main_convert_args(int argc, char const *const *argv)
+{
+  if (argc <= 0)
+  {
+    return {};
+  }
+  return std::vector<std::string>(argv, argv + argc); // storage is sized once for all entries
+}
+
+inline void main_print_preamble(option_parser &parser)
+{
+  // Device info is printed first, followed by the log preamble.
+  auto &out = parser.get_printer();
+  out.print_device_info();
+  out.print_log_preamble();
+}
+
+inline void main_run_benchmarks(option_parser &parser)
+{
+  auto &out     = parser.get_printer();
+  auto &benches = parser.get_benchmarks();
+  // Sum the configuration counts up front so the printer gets the total before any run.
+  std::size_t state_total = 0;
+  for (auto &bench : benches)
+  {
+    state_total += bench->get_config_count();
+  }
+  out.set_completed_state_count(0);
+  out.set_total_state_count(state_total);
+
+  // The printer is attached to each benchmark only around its own run() call.
+  for (auto &bench : benches)
+  {
+    bench->set_printer(out);
+    bench->run();
+    bench->clear_printer();
+  }
+}
+
+inline void main_print_epilogue(option_parser &parser)
+{
+  // Forward to the parser's printer to emit the log epilogue.
+  parser.get_printer().print_log_epilogue();
+}
+
+inline void main_print_results(option_parser &parser)
+{
+  auto &out = parser.get_printer();
+  // Hand every parsed benchmark to the printer for the results output.
+  out.print_benchmark_results(parser.get_benchmarks());
+}
+
+inline void main_finalize() { NVBENCH_CUDA_CALL(cudaDeviceReset()); } // final device reset
+
+} // namespace nvbench::detail
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index 276ca865..6b892a85 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -41,65 +41,54 @@ namespace nvbench
 void markdown_printer::do_print_device_info()
 {
   fmt::memory_buffer buffer;
-  fmt::format_to(buffer, "# Devices\n\n");
+  fmt::format_to(std::back_inserter(buffer), "# Devices\n\n");
 
   const auto &device_mgr = nvbench::device_manager::get();
-  const auto &devices    = device_mgr.get_number_of_used_devices() > 0
-                             ? device_mgr.get_used_devices()
-                             : device_mgr.get_devices();
+  const auto &devices = device_mgr.get_number_of_used_devices() > 0 ? device_mgr.get_used_devices()
+                                                                    : device_mgr.get_devices();
   for (const auto &device : devices)
   {
     const auto [gmem_free, gmem_used] = device.get_global_memory_usage();
 
-    fmt::format_to(buffer, "## [{}] `{}`\n", device.get_id(), device.get_name());
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer), "## [{}] `{}`\n", device.get_id(), device.get_name());
+    fmt::format_to(std::back_inserter(buffer),
                    "* SM Version: {} (PTX Version: {})\n",
                    device.get_sm_version(),
                    device.get_ptx_version());
-    fmt::format_to(buffer, "* Number of SMs: {}\n", device.get_number_of_sms());
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer), "* Number of SMs: {}\n", device.get_number_of_sms());
+    fmt::format_to(std::back_inserter(buffer),
                    "* SM Default Clock Rate: {} MHz\n",
                    device.get_sm_default_clock_rate() / 1000 / 1000);
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    "* Global Memory: {} MiB Free / {} MiB Total\n",
                    gmem_free / 1024 / 1024,
                    gmem_used / 1024 / 1024);
-    fmt::format_to(
-      buffer,
-      "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
-      device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
-      device.get_global_memory_bus_width(),
-      device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
+                   "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
+                   device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
+                   device.get_global_memory_bus_width(),
+                   device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
+    fmt::format_to(std::back_inserter(buffer),
                    "* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
                    device.get_shared_memory_per_sm() / 1024,
                    device.get_shared_memory_per_block() / 1024);
-    fmt::format_to(buffer,
-                   "* L2 Cache Size: {} KiB\n",
-                   device.get_l2_cache_size() / 1024);
-    fmt::format_to(buffer,
-                   "* Maximum Active Blocks: {}/SM\n",
-                   device.get_max_blocks_per_sm());
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer), "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
+    fmt::format_to(std::back_inserter(buffer), "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
+    fmt::format_to(std::back_inserter(buffer),
                    "* Maximum Active Threads: {}/SM, {}/Block\n",
                    device.get_max_threads_per_sm(),
                    device.get_max_threads_per_block());
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    "* Available Registers: {}/SM, {}/Block\n",
                    device.get_registers_per_sm(),
                    device.get_registers_per_block());
-    fmt::format_to(buffer,
-                   "* ECC Enabled: {}\n",
-                   device.get_ecc_state() ? "Yes" : "No");
-    fmt::format_to(buffer, "\n");
+    fmt::format_to(std::back_inserter(buffer), "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
+    fmt::format_to(std::back_inserter(buffer), "\n");
   }
   m_ostream << fmt::to_string(buffer);
 }
 
-void markdown_printer::do_print_log_preamble()
-{
-  m_ostream << "# Log\n\n```\n";
-}
+void markdown_printer::do_print_log_preamble() { m_ostream << "# Log\n\n```\n"; }
 
 void markdown_printer::do_print_log_epilogue() { m_ostream << "```\n\n"; }
 
@@ -147,8 +136,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
 {
   if (m_total_state_count == 0)
   { // No progress info
-    this->log(nvbench::log_level::run,
-              exec_state.get_short_description(m_color));
+    this->log(nvbench::log_level::run, exec_state.get_short_description(m_color));
   }
   else
   { // Add progress
@@ -160,8 +148,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
   }
 }
 
-void markdown_printer::do_print_benchmark_list(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vector &benches)
 {
   if (benches.empty())
   {
@@ -169,20 +156,20 @@ void markdown_printer::do_print_benchmark_list(
   }
 
   fmt::memory_buffer buffer;
-  fmt::format_to(buffer, "# Benchmarks\n\n");
+  fmt::format_to(std::back_inserter(buffer), "# Benchmarks\n\n");
   std::size_t benchmark_id{0};
   for (const auto &bench_ptr : benches)
   {
     const auto &axes              = bench_ptr->get_axes().get_axes();
     const std::size_t num_configs = bench_ptr->get_config_count();
 
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    "## [{}] `{}` ({} configurations)\n\n",
                    benchmark_id++,
                    bench_ptr->get_name(),
                    num_configs);
 
-    fmt::format_to(buffer, "### Axes\n\n");
+    fmt::format_to(std::back_inserter(buffer), "### Axes\n\n");
     for (const auto &axis_ptr : axes)
     {
       std::string flags_str(axis_ptr->get_flags_as_string());
@@ -190,7 +177,7 @@ void markdown_printer::do_print_benchmark_list(
       {
         flags_str = fmt::format(" [{}]", flags_str);
       }
-      fmt::format_to(buffer,
+      fmt::format_to(std::back_inserter(buffer),
                      "* `{}` : {}{}\n",
                      axis_ptr->get_name(),
                      axis_ptr->get_type_as_string(),
@@ -204,20 +191,16 @@ void markdown_printer::do_print_benchmark_list(
         {
           desc = fmt::format(" ({})", desc);
         }
-        fmt::format_to(buffer,
-                       "  * `{}`{}\n",
-                       axis_ptr->get_input_string(i),
-                       desc);
+        fmt::format_to(std::back_inserter(buffer), "  * `{}`{}\n", axis_ptr->get_input_string(i), desc);
       } // end foreach value
     }   // end foreach axis
-    fmt::format_to(buffer, "\n");
+    fmt::format_to(std::back_inserter(buffer), "\n");
   } // end foreach bench
 
   m_ostream << fmt::to_string(buffer);
 }
 
-void markdown_printer::do_print_benchmark_results(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_vector &benches)
 {
   auto format_visitor = [](const auto &v) {
     using T = std::decay_t<decltype(v)>;
@@ -239,7 +222,7 @@ void markdown_printer::do_print_benchmark_results(
 
   // Start printing benchmarks
   fmt::memory_buffer buffer;
-  fmt::format_to(buffer, "# Benchmark Results\n");
+  fmt::format_to(std::back_inserter(buffer), "# Benchmark Results\n");
 
   for (const auto &bench_ptr : benches)
   {
@@ -247,24 +230,20 @@ void markdown_printer::do_print_benchmark_results(
     const auto &devices = bench.get_devices();
     const auto &axes    = bench.get_axes();
 
-    fmt::format_to(buffer, "\n## {}\n", bench.get_name());
+    fmt::format_to(std::back_inserter(buffer), "\n## {}\n", bench.get_name());
 
     // Do a single pass when no devices are specified. This happens for
     // benchmarks with `cpu` exec_tags.
     const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
-    for (std::size_t device_pass = 0; device_pass < num_device_passes;
-         ++device_pass)
+    for (std::size_t device_pass = 0; device_pass < num_device_passes; ++device_pass)
     {
-      std::optional<nvbench::device_info> device =
-        devices.empty() ? std::nullopt
-                        : std::make_optional(devices[device_pass]);
+      std::optional<nvbench::device_info> device = devices.empty()
+                                                     ? std::nullopt
+                                                     : std::make_optional(devices[device_pass]);
 
       if (device)
       {
-        fmt::format_to(buffer,
-                       "\n### [{}] {}\n\n",
-                       device->get_id(),
-                       device->get_name());
+        fmt::format_to(std::back_inserter(buffer), "\n### [{}] {}\n\n", device->get_id(), device->get_name());
       }
 
       std::size_t row = 0;
@@ -288,15 +267,11 @@ void markdown_printer::do_print_benchmark_results(
             {
               const nvbench::int64_t value    = axis_values.get_int64(name);
               const nvbench::int64_t exponent = int64_axis::compute_log2(value);
-              table.add_cell(row,
-                             name,
-                             name,
-                             fmt::format("2^{} = {}", exponent, value));
+              table.add_cell(row, name, name, fmt::format("2^{} = {}", exponent, value));
             }
             else
             {
-              std::string value = std::visit(format_visitor,
-                                             axis_values.get_value(name));
+              std::string value = std::visit(format_visitor, axis_values.get_value(name));
               table.add_cell(row, name + "_axis", name, std::move(value));
             }
           }
@@ -308,12 +283,9 @@ void markdown_printer::do_print_benchmark_results(
               continue;
             }
             const std::string &tag    = summ.get_tag();
-            const std::string &header = summ.has_value("name")
-                                          ? summ.get_string("name")
-                                          : tag;
+            const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
 
-            std::string hint = summ.has_value("hint") ? summ.get_string("hint")
-                                                      : std::string{};
+            std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
             if (hint == "duration")
             {
               table.add_cell(row, tag, header, this->do_format_duration(summ));
@@ -332,10 +304,7 @@ void markdown_printer::do_print_benchmark_results(
             }
             else if (hint == "sample_size")
             {
-              table.add_cell(row,
-                             tag,
-                             header,
-                             this->do_format_sample_size(summ));
+              table.add_cell(row, tag, header, this->do_format_sample_size(summ));
             }
             else if (hint == "percentage")
             {
@@ -351,10 +320,9 @@ void markdown_printer::do_print_benchmark_results(
       }
 
       auto table_str = table.to_string();
-      fmt::format_to(buffer,
+      fmt::format_to(std::back_inserter(buffer),
                      "{}",
-                     table_str.empty() ? "No data -- check log.\n"
-                                       : std::move(table_str));
+                     table_str.empty() ? "No data -- check log.\n" : std::move(table_str));
     } // end foreach device_pass
   }
 
diff --git a/nvbench/named_values.cuh b/nvbench/named_values.cuh
index c11dab4d..1ce51858 100644
--- a/nvbench/named_values.cuh
+++ b/nvbench/named_values.cuh
@@ -33,8 +33,7 @@ namespace nvbench
  */
 struct named_values
 {
-  using value_type =
-    std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
+  using value_type = std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
 
   enum class type
   {
@@ -43,7 +42,7 @@ struct named_values
     string
   };
 
-  void append(const named_values& other);
+  void append(const named_values &other);
 
   [[nodiscard]] std::size_t get_size() const;
   [[nodiscard]] std::vector<std::string> get_names() const;
@@ -60,11 +59,11 @@ struct named_values
 
   [[nodiscard]] type get_type(const std::string &name) const;
   [[nodiscard]] bool has_value(const std::string &name) const;
-  [[nodiscard]] const value_type& get_value(const std::string &name) const;
+  [[nodiscard]] const value_type &get_value(const std::string &name) const;
 
   void clear();
 
-  void remove_value(const std::string& name);
+  void remove_value(const std::string &name);
 
 private:
   struct named_value
diff --git a/nvbench/named_values.cxx b/nvbench/named_values.cxx
index 1aeb4dc4..605789ed 100644
--- a/nvbench/named_values.cxx
+++ b/nvbench/named_values.cxx
@@ -33,9 +33,7 @@ namespace nvbench
 
 void named_values::append(const named_values &other)
 {
-  m_storage.insert(m_storage.end(),
-                   other.m_storage.cbegin(),
-                   other.m_storage.cend());
+  m_storage.insert(m_storage.end(), other.m_storage.cbegin(), other.m_storage.cend());
 }
 
 void named_values::clear() { m_storage.clear(); }
@@ -55,20 +53,17 @@ std::vector<std::string> named_values::get_names() const
 
 bool named_values::has_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
   return iter != m_storage.cend();
 }
 
-const named_values::value_type &
-named_values::get_value(const std::string &name) const
+const named_values::value_type &named_values::get_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
   if (iter == m_storage.cend())
   {
     NVBENCH_THROW(std::runtime_error, "No value with name '{}'.", name);
@@ -96,9 +91,7 @@ named_values::type named_values::get_type(const std::string &name) const
       // warning C4702: unreachable code
       // This is a future-proofing check, it'll be reachable if something breaks
       NVBENCH_MSVC_PUSH_DISABLE_WARNING(4702)
-      NVBENCH_THROW(std::runtime_error,
-                    "Unknown variant type for entry '{}'.",
-                    name);
+      NVBENCH_THROW(std::runtime_error, "Unknown variant type for entry '{}'.", name);
     },
     this->get_value(name));
   NVBENCH_MSVC_POP_WARNING()
@@ -111,10 +104,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up int64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up int64 value `{}`:\n{}", name, err.what());
 }
 
 nvbench::float64_t named_values::get_float64(const std::string &name) const
@@ -124,10 +114,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up float64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up float64 value `{}`:\n{}", name, err.what());
 }
 
 const std::string &named_values::get_string(const std::string &name) const
@@ -137,10 +124,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up string value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up string value `{}`:\n{}", name, err.what());
 }
 
 void named_values::set_int64(std::string name, nvbench::int64_t value)
@@ -165,10 +149,9 @@ void named_values::set_value(std::string name, named_values::value_type value)
 
 void named_values::remove_value(const std::string &name)
 {
-  auto iter =
-    std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
-      return val.name == name;
-    });
+  auto iter = std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
+    return val.name == name;
+  });
   if (iter != m_storage.end())
   {
     m_storage.erase(iter);
diff --git a/nvbench/nvbench.cuh b/nvbench/nvbench.cuh
index 75bf1c1e..3fb933fb 100644
--- a/nvbench/nvbench.cuh
+++ b/nvbench/nvbench.cuh
@@ -24,6 +24,7 @@
 #include <nvbench/callable.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/cpu_timer.cuh>
+#include <nvbench/criterion_manager.cuh>
 #include <nvbench/create.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 55f7f1c7..1edac87f 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -21,6 +21,8 @@
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/benchmark_manager.cuh>
 #include <nvbench/csv_printer.cuh>
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/device_manager.cuh>
 #include <nvbench/git_revision.cuh>
 #include <nvbench/json_printer.cuh>
 #include <nvbench/markdown_printer.cuh>
@@ -82,20 +84,11 @@ std::string_view submatch_to_sv(const sv_submatch &in)
 //
 // So we're stuck with materializing a std::string and calling std::stoX(). Ah
 // well. At least it's not istream.
-void parse(std::string_view input, nvbench::int32_t &val)
-{
-  val = std::stoi(std::string(input));
-}
+void parse(std::string_view input, nvbench::int32_t &val) { val = std::stoi(std::string(input)); }
 
-void parse(std::string_view input, nvbench::int64_t &val)
-{
-  val = std::stoll(std::string(input));
-}
+void parse(std::string_view input, nvbench::int64_t &val) { val = std::stoll(std::string(input)); }
 
-void parse(std::string_view input, nvbench::float64_t &val)
-{
-  val = std::stod(std::string(input));
-}
+void parse(std::string_view input, nvbench::float64_t &val) { val = std::stod(std::string(input)); }
 
 void parse(std::string_view input, std::string &val) { val = input; }
 
@@ -112,9 +105,8 @@ std::vector<T> parse_list_values(std::string_view list_spec)
     "(?:,|$)"  // Delimiters
   };
 
-  auto values_begin =
-    sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
   while (values_begin != values_end)
   {
     auto match          = *values_begin++;
@@ -131,8 +123,7 @@ std::vector<T> parse_list_values(std::string_view list_spec)
 // Parses a range specification "<start> : <stop> [ : <stride> ]" and returns
 // a vector filled with the specified range.
 template <typename T>
-std::vector<T> parse_range_values(std::string_view range_spec,
-                                  nvbench::wrapped_type<T>)
+std::vector<T> parse_range_values(std::string_view range_spec, nvbench::wrapped_type<T>)
 {
   std::vector<T> range_params;
 
@@ -143,9 +134,8 @@ std::vector<T> parse_range_values(std::string_view range_spec,
     "(?:$|:)"  // Delimiters
   };
 
-  auto values_begin =
-    sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
   for (; values_begin != values_end; ++values_begin)
   {
     auto match          = *values_begin;
@@ -221,25 +211,15 @@ std::vector<T> parse_values(std::string_view value_spec)
                                        "$"};        // EOS
 
   sv_match match;
-  if (std::regex_search(value_spec.cbegin(),
-                        value_spec.cend(),
-                        match,
-                        list_regex))
+  if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, list_regex))
   {
     return parse_list_values<T>(submatch_to_sv(match[1]));
   }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             range_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, range_regex))
   {
-    return parse_range_values(submatch_to_sv(match[1]),
-                              nvbench::wrapped_type<T>{});
+    return parse_range_values(submatch_to_sv(match[1]), nvbench::wrapped_type<T>{});
   }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             single_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, single_regex))
   {
     T val;
     parse(submatch_to_sv(match[1]), val);
@@ -247,9 +227,7 @@ std::vector<T> parse_values(std::string_view value_spec)
   }
   else
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid axis value spec: {}",
-                  value_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid axis value spec: {}", value_spec);
   }
 }
 
@@ -389,7 +367,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
   }
 
   auto check_params = [&first, &last](std::size_t num_params) {
-    const std::size_t rem_args = std::distance(first, last) - 1;
+    const std::size_t rem_args = static_cast<std::size_t>(std::distance(first, last) - 1);
     if (rem_args < num_params)
     {
       NVBENCH_THROW(std::runtime_error,
@@ -400,6 +378,9 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
     }
   };
 
+  const nvbench::criterion_manager::params_description criterion_params =
+    nvbench::criterion_manager::get().get_params_description();
+
   while (first < last)
   {
     const auto &arg = *first;
@@ -423,7 +404,21 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
     }
     else if (arg == "--list" || arg == "-l")
     {
-      this->print_list();
+      nvbench::markdown_printer printer{std::cout};
+      this->print_list(printer);
+      std::exit(0);
+    }
+    else if (arg == "--jsonlist-benches")
+    {
+      nvbench::json_printer printer{std::cout};
+      const auto &bench_mgr = nvbench::benchmark_manager::get();
+      printer.print_benchmark_list(bench_mgr.get_benchmarks());
+      std::exit(0);
+    }
+    else if (arg == "--jsonlist-devices")
+    {
+      nvbench::json_printer printer{std::cout};
+      printer.print_devices_json();
       std::exit(0);
     }
     else if (arg == "--persistence-mode" || arg == "--pm")
@@ -443,6 +438,12 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->enable_run_once();
       first += 1;
     }
+    else if (arg == "--stopping-criterion")
+    {
+      check_params(1);
+      this->set_stopping_criterion(first[1]);
+      first += 2;
+    }
     else if (arg == "--disable-blocking-kernel")
     {
       this->disable_blocking_kernel();
@@ -454,7 +455,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->disable_blocking_kernel();
       first += 1;
     }
-    else if (arg == "--quiet" | arg == "-q")
+    else if (arg == "--quiet" || arg == "-q")
     {
       // Setting this flag prevents the default stdout printer from being
       // added.
@@ -514,18 +515,34 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->update_int64_prop(first[0], first[1]);
       first += 2;
     }
-    else if (arg == "--min-time" || arg == "--max-noise" ||
-             arg == "--skip-time" || arg == "--timeout")
+    else if (arg == "--skip-time" || arg == "--timeout")
     {
       check_params(1);
       this->update_float64_prop(first[0], first[1]);
       first += 2;
     }
     else
-    {
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized command-line argument: `{}`.",
-                    arg);
+    { // Try criterion params
+      if (arg.size() < 3 || arg[0] != '-' || arg[1] != '-')
+      {
+        NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
+      }
+
+      std::string_view name(arg.c_str() + 2, arg.size() - 2);
+      auto it = std::find_if(criterion_params.begin(),
+                             criterion_params.end(),
+                             [&name](const auto &param) { return param.first == name; });
+
+      if (it != criterion_params.end())
+      {
+        check_params(1);
+        this->update_criterion_prop(first[0], first[1], it->second);
+        first += 2;
+      }
+      else
+      {
+        NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
+      }
     }
   }
 }
@@ -534,7 +551,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
 try
 {
   std::ostream &stream = this->printer_spec_to_ostream(spec);
-  auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
+  auto &printer        = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
   if (spec == "stdout")
   {
     printer.set_color(m_color_md_stdout_printer);
@@ -556,14 +573,10 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error while adding csv output for `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error while adding csv output for `{}`:\n{}", spec, e.what());
 }
 
-void option_parser::add_json_printer(const std::string &spec,
-                                     bool enable_binary)
+void option_parser::add_json_printer(const std::string &spec, bool enable_binary)
 try
 {
   std::ostream &stream = this->printer_spec_to_ostream(spec);
@@ -610,11 +623,9 @@ void option_parser::print_version() const
              NVBENCH_GIT_VERSION);
 }
 
-void option_parser::print_list() const
+void option_parser::print_list(printer_base& printer) const
 {
   const auto &bench_mgr = nvbench::benchmark_manager::get();
-
-  nvbench::markdown_printer printer{std::cout};
   printer.print_device_info();
   printer.print_benchmark_list(bench_mgr.get_benchmarks());
 }
@@ -624,10 +635,7 @@ void option_parser::print_help() const
   fmt::print("{}\n{}\n", ::cli_help_text, ::cli_help_axis_text);
 }
 
-void option_parser::print_help_axis() const
-{
-  fmt::print("{}\n", ::cli_help_axis_text);
-}
+void option_parser::print_help_axis() const { fmt::print("{}\n", ::cli_help_axis_text); }
 
 void option_parser::set_persistence_mode(const std::string &state)
 try
@@ -685,9 +693,7 @@ try
   {
     if (rate_val == nvbench::device_info::clock_rate::none)
     {
-      fmt::print("Unlocking clocks for device '{}' ({}).\n",
-                 device.get_name(),
-                 device.get_id());
+      fmt::print("Unlocking clocks for device '{}' ({}).\n", device.get_name(), device.get_id());
     }
     else
     {
@@ -721,6 +727,20 @@ void option_parser::enable_run_once()
   bench.set_run_once(true);
 }
 
+void option_parser::set_stopping_criterion(const std::string &criterion)
+{
+  // If no active benchmark, save args as global.
+  if (m_benchmarks.empty())
+  {
+    m_global_benchmark_args.push_back("--stopping-criterion");
+    m_global_benchmark_args.push_back(criterion);
+    return;
+  }
+
+  benchmark_base &bench = *m_benchmarks.back();
+  bench.set_stopping_criterion(criterion);
+}
+
 void option_parser::disable_blocking_kernel()
 {
   // If no active benchmark, save args as global.
@@ -749,7 +769,7 @@ try
   catch (std::invalid_argument &)
   {}
 
-  m_benchmarks.push_back(idx >= 0 ? mgr.get_benchmark(idx).clone()
+  m_benchmarks.push_back(idx >= 0 ? mgr.get_benchmark(static_cast<std::size_t>(idx)).clone()
                                   : mgr.get_benchmark(name).clone());
 
   // Initialize the new benchmark with any global arguments:
@@ -757,16 +777,12 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --benchmark `{}`:\n{}",
-                name,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --benchmark `{}`:\n{}", name, e.what());
 }
 
 void option_parser::replay_global_args()
 {
-  this->parse_range(m_global_benchmark_args.cbegin(),
-                    m_global_benchmark_args.cend());
+  this->parse_range(m_global_benchmark_args.cbegin(), m_global_benchmark_args.cend());
 }
 
 void option_parser::update_devices(const std::string &devices)
@@ -790,10 +806,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --devices `{}`:\n{}",
-                devices,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --devices `{}`:\n{}", devices, e.what());
 }
 
 void option_parser::update_axis(const std::string &spec)
@@ -832,28 +845,20 @@ try
   switch (axis.get_type())
   {
     case axis_type::type:
-      this->update_type_axis(static_cast<nvbench::type_axis &>(axis),
-                             values,
-                             flags);
+      this->update_type_axis(static_cast<nvbench::type_axis &>(axis), values, flags);
       break;
 
     case axis_type::int64:
-      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis),
-                              values,
-                              flags);
+      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis), values, flags);
       break;
 
     case axis_type::float64:
-      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis),
-                                values,
-                                flags);
+      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis), values, flags);
 
       break;
 
     case axis_type::string:
-      this->update_string_axis(static_cast<nvbench::string_axis &>(axis),
-                               values,
-                               flags);
+      this->update_string_axis(static_cast<nvbench::string_axis &>(axis), values, flags);
 
       break;
 
@@ -866,10 +871,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --axis `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --axis `{}`:\n{}", spec, e.what());
 }
 
 void option_parser::update_int64_axis(int64_axis &axis,
@@ -888,9 +890,7 @@ void option_parser::update_int64_axis(int64_axis &axis,
   }
   else
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for int64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for int64 axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<nvbench::int64_t>(value_spec);
@@ -905,9 +905,7 @@ void option_parser::update_float64_axis(float64_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for float64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for float64 axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<nvbench::float64_t>(value_spec);
@@ -922,9 +920,7 @@ void option_parser::update_string_axis(string_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for string axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for string axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<std::string>(value_spec);
@@ -939,9 +935,7 @@ void option_parser::update_type_axis(type_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for type axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for type axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<std::string>(value_spec);
@@ -949,8 +943,7 @@ void option_parser::update_type_axis(type_axis &axis,
   axis.set_active_inputs(input_values);
 }
 
-void option_parser::update_int64_prop(const std::string &prop_arg,
-                                      const std::string &prop_val)
+void option_parser::update_int64_prop(const std::string &prop_arg, const std::string &prop_val)
 try
 {
   // If no active benchmark, save args as global.
@@ -983,9 +976,11 @@ catch (std::exception &e)
                 e.what());
 }
 
-void option_parser::update_float64_prop(const std::string &prop_arg,
-                                        const std::string &prop_val)
-try
+void option_parser::update_criterion_prop(
+  const std::string &prop_arg,
+  const std::string &prop_val,
+  const nvbench::named_values::type type)
+try 
 {
   // If no active benchmark, save args as global.
   if (m_benchmarks.empty())
@@ -996,18 +991,59 @@ try
   }
 
   benchmark_base &bench = *m_benchmarks.back();
+  nvbench::criterion_params& criterion_params = bench.get_criterion_params();
+  std::string name(prop_arg.begin() + 2, prop_arg.end());
+  if (type == nvbench::named_values::type::float64) 
+  {
+    nvbench::float64_t value{};
+    ::parse(prop_val, value);
 
-  nvbench::float64_t value{};
-  ::parse(prop_val, value);
-  if (prop_arg == "--min-time")
+    if (prop_arg == "--max-noise")
+    { // Specified as percentage, stored as ratio:
+      value /= 100.0;
+    }
+    criterion_params.set_float64(name, value);
+  }
+  else if (type == nvbench::named_values::type::int64) 
+  {
+    nvbench::int64_t value{};
+    ::parse(prop_val, value);
+    criterion_params.set_int64(name, value);
+  }
+  else if (type == nvbench::named_values::type::string) 
   {
-    bench.set_min_time(value);
+    criterion_params.set_string(name, prop_val);
   }
-  else if (prop_arg == "--max-noise")
-  { // Specified as percentage, stored as ratio:
-    bench.set_max_noise(value / 100.);
+  else 
+  {
+    NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
   }
-  else if (prop_arg == "--skip-time")
+}
+catch (std::exception& e)
+{
+  NVBENCH_THROW(std::runtime_error,
+                "Error handling option `{} {}`:\n{}",
+                prop_arg,
+                prop_val,
+                e.what());
+}
+
+void option_parser::update_float64_prop(const std::string &prop_arg, const std::string &prop_val)
+try
+{
+  // If no active benchmark, save args as global.
+  if (m_benchmarks.empty())
+  {
+    m_global_benchmark_args.push_back(prop_arg);
+    m_global_benchmark_args.push_back(prop_val);
+    return;
+  }
+
+  benchmark_base &bench = *m_benchmarks.back();
+
+  nvbench::float64_t value{};
+  ::parse(prop_val, value);
+  if (prop_arg == "--skip-time")
   {
     bench.set_skip_time(value);
   }
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index e35d7241..5bd834c3 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -20,6 +20,7 @@
 
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_multiplex.cuh>
+#include <nvbench/stopping_criterion.cuh>
 
 #include <iosfwd>
 #include <memory>
@@ -41,8 +42,7 @@ struct type_axis;
  */
 struct option_parser
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
 
   option_parser();
   ~option_parser();
@@ -51,15 +51,9 @@ struct option_parser
   void parse(std::vector<std::string> args);
 
   [[nodiscard]] benchmark_vector &get_benchmarks() { return m_benchmarks; };
-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
 
-  [[nodiscard]] const std::vector<std::string> &get_args() const
-  {
-    return m_args;
-  }
+  [[nodiscard]] const std::vector<std::string> &get_args() const { return m_args; }
 
   /*!
    * Returns the output format requested by the parse options.
@@ -86,13 +80,14 @@ private:
   std::ostream &printer_spec_to_ostream(const std::string &spec);
 
   void print_version() const;
-  void print_list() const;
+  void print_list(printer_base& printer) const;
   void print_help() const;
   void print_help_axis() const;
 
   void set_persistence_mode(const std::string &state);
   void lock_gpu_clocks(const std::string &rate);
 
+  void set_stopping_criterion(const std::string &criterion);
   void enable_run_once();
   void disable_blocking_kernel();
 
@@ -115,10 +110,12 @@ private:
                                std::string_view value_spec,
                                std::string_view flag_spec);
 
-  void update_int64_prop(const std::string &prop_arg,
-                         const std::string &prop_val);
-  void update_float64_prop(const std::string &prop_arg,
-                           const std::string &prop_val);
+  void update_int64_prop(const std::string &prop_arg, const std::string &prop_val);
+  void update_float64_prop(const std::string &prop_arg, const std::string &prop_val);
+
+  void update_criterion_prop(const std::string &prop_arg,
+                             const std::string &prop_val,
+                             const nvbench::named_values::type type);
 
   void update_used_device_state() const;
 
diff --git a/nvbench/printer_base.cuh b/nvbench/printer_base.cuh
index 0e28a352..13cf803b 100644
--- a/nvbench/printer_base.cuh
+++ b/nvbench/printer_base.cuh
@@ -22,6 +22,7 @@
 
 #include <iosfwd>
 #include <memory>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -76,19 +77,16 @@ struct printer_base
   virtual ~printer_base();
 
   // move-only
-  printer_base(const printer_base &) = delete;
-  printer_base(printer_base &&)      = default;
+  printer_base(const printer_base &)            = delete;
+  printer_base(printer_base &&)                 = default;
   printer_base &operator=(const printer_base &) = delete;
-  printer_base &operator=(printer_base &&) = default;
+  printer_base &operator=(printer_base &&)      = delete;
 
   /*!
    * Called once with the command line arguments used to invoke the current
    * executable.
    */
-  void log_argv(const std::vector<std::string> &argv)
-  {
-    this->do_log_argv(argv);
-  }
+  void log_argv(const std::vector<std::string> &argv) { this->do_log_argv(argv); }
 
   /*!
    * Print a summary of all detected devices, if supported.
@@ -108,19 +106,13 @@ struct printer_base
   /*!
    * Print a log message at the specified log level.
    */
-  void log(nvbench::log_level level, const std::string &msg)
-  {
-    this->do_log(level, msg);
-  }
+  void log(nvbench::log_level level, const std::string &msg) { this->do_log(level, msg); }
 
   /*!
    * Called before running the measurements associated with state.
    * Implementations are expected to call `log(log_level::run, ...)`.
    */
-  void log_run_state(const nvbench::state &exec_state)
-  {
-    this->do_log_run_state(exec_state);
-  }
+  void log_run_state(const nvbench::state &exec_state) { this->do_log_run_state(exec_state); }
 
   /*!
    * Measurements may call this to allow a printer to perform extra processing
@@ -181,10 +173,7 @@ struct printer_base
     return this->do_get_completed_state_count();
   }
 
-  virtual void set_total_state_count(std::size_t states)
-  {
-    this->do_set_total_state_count(states);
-  }
+  virtual void set_total_state_count(std::size_t states) { this->do_set_total_state_count(states); }
   [[nodiscard]] virtual std::size_t get_total_state_count() const
   {
     return this->do_get_total_state_count();
@@ -193,18 +182,22 @@ struct printer_base
 
 protected:
   // Implementation hooks for subclasses:
-  virtual void do_log_argv(const std::vector<std::string>&) {}
+  virtual void do_log_argv(const std::vector<std::string> &) {}
   virtual void do_print_device_info() {}
   virtual void do_print_log_preamble() {}
   virtual void do_print_log_epilogue() {}
   virtual void do_log(nvbench::log_level, const std::string &) {}
   virtual void do_log_run_state(const nvbench::state &) {}
-  virtual void
-  do_process_bulk_data_float64(nvbench::state &,
-                               const std::string &,
-                               const std::string &,
-                               const std::vector<nvbench::float64_t> &){};
-  virtual void do_print_benchmark_list(const benchmark_vector &) {}
+  virtual void do_process_bulk_data_float64(nvbench::state &,
+                                            const std::string &,
+                                            const std::string &,
+                                            const std::vector<nvbench::float64_t> &){};
+
+  virtual void do_print_benchmark_list(const benchmark_vector &)
+  {
+    throw std::runtime_error{"nvbench::do_print_benchmark_list is not supported by this printer."};
+  }
+
   virtual void do_print_benchmark_results(const benchmark_vector &) {}
 
   virtual void do_set_completed_state_count(std::size_t states);
diff --git a/nvbench/printer_base.cxx b/nvbench/printer_base.cxx
index 66de7959..639edc20 100644
--- a/nvbench/printer_base.cxx
+++ b/nvbench/printer_base.cxx
@@ -38,19 +38,10 @@ void printer_base::do_set_completed_state_count(std::size_t states)
 
 void printer_base::do_add_completed_state() { ++m_completed_state_count; }
 
-std::size_t printer_base::do_get_completed_state_count() const
-{
-  return m_completed_state_count;
-}
+std::size_t printer_base::do_get_completed_state_count() const { return m_completed_state_count; }
 
-void printer_base::do_set_total_state_count(std::size_t states)
-{
-  m_total_state_count = states;
-}
+void printer_base::do_set_total_state_count(std::size_t states) { m_total_state_count = states; }
 
-std::size_t printer_base::do_get_total_state_count() const
-{
-  return m_total_state_count;
-}
+std::size_t printer_base::do_get_total_state_count() const { return m_total_state_count; }
 
 } // namespace nvbench
diff --git a/nvbench/printer_multiplex.cuh b/nvbench/printer_multiplex.cuh
index f32a0e9b..797b480c 100644
--- a/nvbench/printer_multiplex.cuh
+++ b/nvbench/printer_multiplex.cuh
@@ -40,10 +40,7 @@ struct printer_multiplex : nvbench::printer_base
     return static_cast<Format &>(*m_printers.back());
   }
 
-  [[nodiscard]] std::size_t get_printer_count() const
-  {
-    return m_printers.size();
-  }
+  [[nodiscard]] std::size_t get_printer_count() const { return m_printers.size(); }
 
 protected:
   void do_log_argv(const std::vector<std::string> &argv) override;
@@ -52,11 +49,10 @@ protected:
   void do_print_log_epilogue() override;
   void do_log(nvbench::log_level, const std::string &) override;
   void do_log_run_state(const nvbench::state &) override;
-  void do_process_bulk_data_float64(
-    nvbench::state &,
-    const std::string &,
-    const std::string &,
-    const std::vector<nvbench::float64_t> &) override;
+  void do_process_bulk_data_float64(nvbench::state &,
+                                    const std::string &,
+                                    const std::string &,
+                                    const std::vector<nvbench::float64_t> &) override;
   void do_print_benchmark_list(const benchmark_vector &benches) override;
   void do_print_benchmark_results(const benchmark_vector &benches) override;
   void do_set_completed_state_count(std::size_t states) override;
diff --git a/nvbench/printer_multiplex.cxx b/nvbench/printer_multiplex.cxx
index 86d99544..89867c12 100644
--- a/nvbench/printer_multiplex.cxx
+++ b/nvbench/printer_multiplex.cxx
@@ -67,11 +67,10 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
   }
 }
 
-void printer_multiplex::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void printer_multiplex::do_process_bulk_data_float64(state &state,
+                                                     const std::string &tag,
+                                                     const std::string &hint,
+                                                     const std::vector<nvbench::float64_t> &data)
 {
   for (auto &format_ptr : m_printers)
   {
@@ -87,8 +86,7 @@ void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
   }
 }
 
-void printer_multiplex::do_print_benchmark_results(
-  const benchmark_vector &benches)
+void printer_multiplex::do_print_benchmark_results(const benchmark_vector &benches)
 {
   for (auto &format_ptr : m_printers)
   {
diff --git a/nvbench/range.cuh b/nvbench/range.cuh
index f0e82550..7000f872 100644
--- a/nvbench/range.cuh
+++ b/nvbench/range.cuh
@@ -29,13 +29,11 @@ namespace nvbench
 namespace detail
 {
 template <typename T>
-using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
-                                          nvbench::float64_t,
-                                          nvbench::int64_t>;
+using range_output_t =
+  std::conditional_t<std::is_floating_point_v<T>, nvbench::float64_t, nvbench::int64_t>;
 }
 
-template <typename InT,
-          typename OutT = nvbench::detail::range_output_t<InT>>
+template <typename InT, typename OutT = nvbench::detail::range_output_t<InT>>
 auto range(InT start, InT end, InT stride = InT{1})
 {
   if constexpr (std::is_floating_point_v<InT>)
diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh
index 9435906d..f32b2223 100644
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -37,8 +37,7 @@ struct runner_base
 
   void generate_states();
 
-  void handle_sampling_exception(const std::exception &e,
-                                 nvbench::state &exec_state) const;
+  void handle_sampling_exception(const std::exception &e, nvbench::state &exec_state) const;
 
   void run_state_prologue(state &exec_state) const;
   void run_state_epilogue(state &exec_state) const;
@@ -51,11 +50,10 @@ struct runner_base
 template <typename BenchmarkType>
 struct runner : public runner_base
 {
-  using benchmark_type   = BenchmarkType;
-  using kernel_generator = typename benchmark_type::kernel_generator;
-  using type_configs     = typename benchmark_type::type_configs;
-  static constexpr std::size_t num_type_configs =
-    benchmark_type::num_type_configs;
+  using benchmark_type                          = BenchmarkType;
+  using kernel_generator                        = typename benchmark_type::kernel_generator;
+  using type_configs                            = typename benchmark_type::type_configs;
+  static constexpr std::size_t num_type_configs = benchmark_type::num_type_configs;
 
   explicit runner(benchmark_type &bench)
       : runner_base{bench}
@@ -86,38 +84,37 @@ private:
 
     // Iterate through type_configs:
     std::size_t type_config_index = 0;
-    nvbench::tl::foreach<type_configs>([&self   = *this,
-                                        &states = m_benchmark.m_states,
-                                        &type_config_index,
-                                        &device](auto type_config_wrapper) {
-      // Get current type_config:
-      using type_config = typename decltype(type_config_wrapper)::type;
-
-      // Find states with the current device / type_config
-      for (nvbench::state &cur_state : states)
-      {
-        if (cur_state.get_device() == device &&
-            cur_state.get_type_config_index() == type_config_index)
+    nvbench::tl::foreach<type_configs>(
+      [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device](
+        auto type_config_wrapper) {
+        // Get current type_config:
+        using type_config = typename decltype(type_config_wrapper)::type;
+
+        // Find states with the current device / type_config
+        for (nvbench::state &cur_state : states)
         {
-          self.run_state_prologue(cur_state);
-          try
+          if (cur_state.get_device() == device &&
+              cur_state.get_type_config_index() == type_config_index)
           {
-            kernel_generator{}(cur_state, type_config{});
-            if (cur_state.is_skipped())
+            self.run_state_prologue(cur_state);
+            try
             {
-              self.print_skip_notification(cur_state);
+              kernel_generator{}(cur_state, type_config{});
+              if (cur_state.is_skipped())
+              {
+                self.print_skip_notification(cur_state);
+              }
             }
+            catch (std::exception &e)
+            {
+              self.handle_sampling_exception(e, cur_state);
+            }
+            self.run_state_epilogue(cur_state);
           }
-          catch (std::exception &e)
-          {
-            self.handle_sampling_exception(e, cur_state);
-          }
-          self.run_state_epilogue(cur_state);
         }
-      }
 
-      ++type_config_index;
-    });
+        ++type_config_index;
+      });
   }
 };
 
diff --git a/nvbench/runner.cxx b/nvbench/runner.cxx
index 3aba964d..93cedf57 100644
--- a/nvbench/runner.cxx
+++ b/nvbench/runner.cxx
@@ -35,8 +35,7 @@ void runner_base::generate_states()
   m_benchmark.m_states = nvbench::detail::state_generator::create(m_benchmark);
 }
 
-void runner_base::handle_sampling_exception(const std::exception &e,
-                                            state &exec_state) const
+void runner_base::handle_sampling_exception(const std::exception &e, state &exec_state) const
 {
   // If the state is skipped, that means the execution framework class handled
   // the error already.
@@ -62,8 +61,7 @@ void runner_base::handle_sampling_exception(const std::exception &e,
 void runner_base::run_state_prologue(nvbench::state &exec_state) const
 {
   // Log if a printer exists:
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log_run_state(exec_state);
@@ -73,19 +71,16 @@ void runner_base::run_state_prologue(nvbench::state &exec_state) const
 void runner_base::run_state_epilogue(state &exec_state) const
 {
   // Notify the printer that the state has completed::
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.add_completed_state();
   }
 }
 
-
 void runner_base::print_skip_notification(state &exec_state) const
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::skip, exec_state.get_skip_reason());
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 336ba2ba..09795de3 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -24,6 +24,7 @@
 #include <nvbench/named_values.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/types.cuh>
+#include <nvbench/stopping_criterion.cuh>
 
 #include <functional>
 #include <optional>
@@ -58,106 +59,79 @@ struct state_tester;
 struct state
 {
   // move-only
-  state(const state &) = delete;
-  state(state &&)      = default;
+  state(const state &)            = delete;
+  state(state &&)                 = default;
   state &operator=(const state &) = delete;
-  state &operator=(state &&) = default;
+  state &operator=(state &&)      = default;
 
-  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
-  {
-    return m_cuda_stream;
-  }
-  void set_cuda_stream(nvbench::cuda_stream &&stream)
-  {
-    m_cuda_stream = std::move(stream);
-  }
+  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
+  void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
 
   /// The CUDA device associated with with this benchmark state. May be
   /// nullopt for CPU-only benchmarks.
-  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
-  {
-    return m_device;
-  }
+  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
 
   /// An index into a benchmark::type_configs type_list. Returns 0 if no type
   /// axes in the associated benchmark.
-  [[nodiscard]] std::size_t get_type_config_index() const
-  {
-    return m_type_config_index;
-  }
+  [[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
 
   [[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::int64_t
-  get_int64_or_default(const std::string &axis_name,
-                       nvbench::int64_t default_value) const;
-
-  [[nodiscard]] nvbench::float64_t
-  get_float64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::float64_t
-  get_float64_or_default(const std::string &axis_name,
-                         nvbench::float64_t default_value) const;
-
-  [[nodiscard]] const std::string &
-  get_string(const std::string &axis_name) const;
-  [[nodiscard]] const std::string &
-  get_string_or_default(const std::string &axis_name,
-                        const std::string &default_value) const;
+  [[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
+                                                      nvbench::int64_t default_value) const;
+
+  [[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
+  [[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
+                                                          nvbench::float64_t default_value) const;
+
+  [[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
+  [[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
+                                                         const std::string &default_value) const;
 
   void add_element_count(std::size_t elements, std::string column_name = {});
 
   void set_element_count(std::size_t elements) { m_element_count = elements; }
-  [[nodiscard]] std::size_t get_element_count() const
-  {
-    return m_element_count;
-  }
+  [[nodiscard]] std::size_t get_element_count() const { return m_element_count; }
 
   template <typename ElementType>
   void add_global_memory_reads(std::size_t count, std::string column_name = {})
   {
-    this->add_global_memory_reads(count * sizeof(ElementType),
-                                  std::move(column_name));
+    this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
   }
   void add_global_memory_reads(std::size_t bytes, std::string column_name = {});
 
   template <typename ElementType>
   void add_global_memory_writes(std::size_t count, std::string column_name = {})
   {
-    this->add_global_memory_writes(count * sizeof(ElementType),
-                                   std::move(column_name));
+    this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
   }
-  void add_global_memory_writes(std::size_t bytes,
-                                std::string column_name = {});
+  void add_global_memory_writes(std::size_t bytes, std::string column_name = {});
 
   void add_buffer_size(std::size_t num_bytes,
                        std::string summary_tag,
                        std::string column_name = {},
                        std::string description = {});
 
-  void set_global_memory_rw_bytes(std::size_t bytes)
-  {
-    m_global_memory_rw_bytes = bytes;
-  }
-  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const
-  {
-    return m_global_memory_rw_bytes;
-  }
+  void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
+  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }
 
   void skip(std::string reason) { m_skip_reason = std::move(reason); }
   [[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
-  [[nodiscard]] const std::string &get_skip_reason() const
-  {
-    return m_skip_reason;
-  }
+  [[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }
 
   /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
-  void set_min_samples(nvbench::int64_t min_samples)
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
+  void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
+  /// @}
+
+  [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
   {
-    m_min_samples = min_samples;
+    return m_criterion_params;
   }
+
+  /// Control the stopping criterion for the measurement loop.
+  /// @{
+  [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
+  void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); }
   /// @}
 
   /// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -173,16 +147,30 @@ struct state
   void set_disable_blocking_kernel(bool v) { m_disable_blocking_kernel = v; }
   /// @}
 
-  /// Accumulate at least this many seconds of timing data per measurement. @{
-  [[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
-  void set_min_time(nvbench::float64_t min_time) { m_min_time = min_time; }
+  /// Accumulate at least this many seconds of timing data per measurement.
+  /// Only applies to the `stdrel` stopping criterion. @{
+  [[nodiscard]] nvbench::float64_t get_min_time() const
+  {
+    return m_criterion_params.get_float64("min-time");
+  }
+  void set_min_time(nvbench::float64_t min_time)
+  {
+    m_criterion_params.set_float64("min-time", min_time);
+  }
   /// @}
 
   /// Specify the maximum amount of noise if a measurement supports noise.
   /// Noise is the relative standard deviation:
-  /// `noise = stdev / mean_time`. @{
-  [[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
-  void set_max_noise(nvbench::float64_t max_noise) { m_max_noise = max_noise; }
+  /// `noise = stdev / mean_time`.
+  /// Only applies to the `stdrel` stopping criterion. @{
+  [[nodiscard]] nvbench::float64_t get_max_noise() const
+  {
+    return m_criterion_params.get_float64("max-noise");
+  }
+  void set_max_noise(nvbench::float64_t max_noise)
+  {
+    m_criterion_params.set_float64("max-noise", max_noise);
+  }
   /// @}
 
   /// If a warmup run finishes in less than `skip_time`, the measurement will
@@ -222,20 +210,14 @@ struct state
   }
   ///@}
 
-  [[nodiscard]] const named_values &get_axis_values() const
-  {
-    return m_axis_values;
-  }
+  [[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }
 
   /*!
    * Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
    */
   [[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;
 
-  [[nodiscard]] const benchmark_base &get_benchmark() const
-  {
-    return m_benchmark;
-  }
+  [[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }
 
   void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
   void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
@@ -252,26 +234,11 @@ struct state
     collect_dram_throughput();
   }
 
-  [[nodiscard]] bool is_l1_hit_rate_collected() const
-  {
-    return m_collect_l1_hit_rates;
-  }
-  [[nodiscard]] bool is_l2_hit_rate_collected() const
-  {
-    return m_collect_l2_hit_rates;
-  }
-  [[nodiscard]] bool is_stores_efficiency_collected() const
-  {
-    return m_collect_stores_efficiency;
-  }
-  [[nodiscard]] bool is_loads_efficiency_collected() const
-  {
-    return m_collect_loads_efficiency;
-  }
-  [[nodiscard]] bool is_dram_throughput_collected() const
-  {
-    return m_collect_dram_throughput;
-  }
+  [[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
+  [[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
+  [[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
+  [[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
+  [[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }
 
   [[nodiscard]] bool is_cupti_required() const
   {
@@ -306,8 +273,7 @@ struct state
   template <typename KernelLauncher>
   void exec(KernelLauncher &&kernel_launcher)
   {
-    this->exec(nvbench::exec_tag::none,
-               std::forward<KernelLauncher>(kernel_launcher));
+    this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
   }
 
 private:
@@ -330,9 +296,11 @@ private:
   bool m_run_once{false};
   bool m_disable_blocking_kernel{false};
 
+
+  nvbench::criterion_params m_criterion_params;
+  std::string m_stopping_criterion;
+
   nvbench::int64_t m_min_samples;
-  nvbench::float64_t m_min_time;
-  nvbench::float64_t m_max_noise;
 
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 0774faa7..1be48c58 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -36,9 +36,9 @@ state::state(const benchmark_base &bench)
     : m_benchmark{bench}
     , m_run_once{bench.get_run_once()}
     , m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
+    , m_criterion_params{bench.get_criterion_params()}
+    , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
-    , m_min_time{bench.get_min_time()}
-    , m_max_noise{bench.get_max_noise()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
 {}
@@ -53,9 +53,9 @@ state::state(const benchmark_base &bench,
     , m_type_config_index{type_config_index}
     , m_run_once{bench.get_run_once()}
     , m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
+    , m_criterion_params{bench.get_criterion_params()}
+    , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
-    , m_min_time{bench.get_min_time()}
-    , m_max_noise{bench.get_max_noise()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
 {}
@@ -65,9 +65,8 @@ nvbench::int64_t state::get_int64(const std::string &axis_name) const
   return m_axis_values.get_int64(axis_name);
 }
 
-nvbench::int64_t
-state::get_int64_or_default(const std::string &axis_name,
-                            nvbench::int64_t default_value) const
+nvbench::int64_t state::get_int64_or_default(const std::string &axis_name,
+                                             nvbench::int64_t default_value) const
 try
 {
   return this->get_int64(axis_name);
@@ -82,9 +81,8 @@ nvbench::float64_t state::get_float64(const std::string &axis_name) const
   return m_axis_values.get_float64(axis_name);
 }
 
-nvbench::float64_t
-state::get_float64_or_default(const std::string &axis_name,
-                              nvbench::float64_t default_value) const
+nvbench::float64_t state::get_float64_or_default(const std::string &axis_name,
+                                                 nvbench::float64_t default_value) const
 try
 {
   return this->get_float64(axis_name);
@@ -99,9 +97,8 @@ const std::string &state::get_string(const std::string &axis_name) const
   return m_axis_values.get_string(axis_name);
 }
 
-const std::string &
-state::get_string_or_default(const std::string &axis_name,
-                             const std::string &default_value) const
+const std::string &state::get_string_or_default(const std::string &axis_name,
+                                                const std::string &default_value) const
 try
 {
   return this->get_string(axis_name);
@@ -125,20 +122,18 @@ summary &state::add_summary(summary s)
 const summary &state::get_summary(std::string_view tag) const
 {
   // Check tags first
-  auto iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_tag() == tag; });
+  auto iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
   if (iter != m_summaries.cend())
   {
     return *iter;
   }
 
   // Then names:
-  iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_string("name") == tag; });
+  iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
   if (iter != m_summaries.cend())
   {
     return *iter;
@@ -150,20 +145,18 @@ const summary &state::get_summary(std::string_view tag) const
 summary &state::get_summary(std::string_view tag)
 {
   // Check tags first
-  auto iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_tag() == tag;
-    });
+  auto iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
   if (iter != m_summaries.end())
   {
     return *iter;
   }
 
   // Then names:
-  iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_string("name") == tag;
-    });
+  iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
   if (iter != m_summaries.end())
   {
     return *iter;
@@ -187,18 +180,17 @@ std::string state::get_axis_values_as_string(bool color) const
   // Create a Key=Value list of all parameters:
   fmt::memory_buffer buffer;
 
-  auto append_key_value = [&buffer, &style](const std::string &key,
-                                            const auto &value,
-                                            std::string value_fmtstr = "{}") {
-    constexpr auto key_format   = fmt::emphasis::italic;
-    constexpr auto value_format = fmt::emphasis::bold;
-
-    fmt::format_to(buffer,
-                   "{}{}={}",
-                   buffer.size() == 0 ? "" : " ",
-                   fmt::format(style(key_format), "{}", key),
-                   fmt::format(style(value_format), value_fmtstr, value));
-  };
+  auto append_key_value =
+    [&buffer, &style](const std::string &key, const auto &value, std::string value_fmtstr = "{}") {
+      constexpr auto key_format   = fmt::emphasis::italic;
+      constexpr auto value_format = fmt::emphasis::bold;
+
+      fmt::format_to(std::back_inserter(buffer),
+                     "{}{}={}",
+                     buffer.size() == 0 ? "" : " ",
+                     fmt::format(style(key_format), "{}", key),
+                     fmt::format(style(value_format), value_fmtstr, value));
+    };
 
   if (m_device)
   {
@@ -211,8 +203,7 @@ std::string state::get_axis_values_as_string(bool color) const
     const auto axis_type = m_axis_values.get_type(name);
 
     // Handle power-of-two int64 axes differently:
-    if (axis_type == named_values::type::int64 &&
-        axes.get_int64_axis(name).is_power_of_two())
+    if (axis_type == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two())
     {
       const nvbench::int64_t value    = m_axis_values.get_int64(name);
       const nvbench::int64_t exponent = int64_axis::compute_log2(value);
@@ -242,10 +233,9 @@ std::string state::get_short_description(bool color) const
     return color ? fmt_style : no_style;
   };
 
-  return fmt::format(
-    "{} [{}]",
-    fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
-    this->get_axis_values_as_string(color));
+  return fmt::format("{} [{}]",
+                     fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
+                     this->get_axis_values_as_string(color));
 }
 
 void state::add_element_count(std::size_t elements, std::string column_name)
diff --git a/nvbench/stopping_criterion.cuh b/nvbench/stopping_criterion.cuh
new file mode 100644
index 00000000..006a6994
--- /dev/null
+++ b/nvbench/stopping_criterion.cuh
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/named_values.cuh>
+#include <nvbench/types.cuh>
+
+#include <string>
+
+#include <initializer_list>
+#include <unordered_map>
+
+namespace nvbench
+{
+
+namespace detail 
+{
+
+constexpr nvbench::float64_t compat_min_time() { return 0.5; }    // 0.5 seconds
+constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relative standard deviation
+
+} // namespace detail
+
+/**
+ * Stores all the parameters for the stopping criterion in use
+ */
+class criterion_params
+{
+  nvbench::named_values m_named_values;
+public:
+  criterion_params();
+  criterion_params(std::initializer_list<std::pair<std::string, nvbench::named_values::value_type>>);
+
+  /**
+   * Set parameter values from another criterion_params object if they exist
+   *
+   * Parameters in `other` that do not correspond to parameters in `this` are ignored.
+   */
+  void set_from(const criterion_params &other);
+
+  void set_int64(std::string name, nvbench::int64_t value);
+  void set_float64(std::string name, nvbench::float64_t value);
+  void set_string(std::string name, std::string value);
+
+  [[nodiscard]] std::vector<std::string> get_names() const;
+  [[nodiscard]] nvbench::named_values::type get_type(const std::string &name) const;
+
+  [[nodiscard]] bool has_value(const std::string &name) const;
+  [[nodiscard]] nvbench::int64_t get_int64(const std::string &name) const;
+  [[nodiscard]] nvbench::float64_t get_float64(const std::string &name) const;
+  [[nodiscard]] std::string get_string(const std::string &name) const;
+};
+
+/**
+ * Stopping criterion interface
+ */
+class stopping_criterion_base
+{
+protected:
+  std::string m_name;
+  criterion_params m_params;
+
+public:
+  /**
+   * @param name Unique name of the criterion
+   * @param params Default values for all parameters of the criterion
+   */
+  explicit stopping_criterion_base(std::string name, criterion_params params)
+      : m_name{std::move(name)}
+      , m_params{std::move(params)}
+  {}
+
+  virtual ~stopping_criterion_base() = default;
+
+  [[nodiscard]] const std::string &get_name() const { return m_name; }
+  [[nodiscard]] const criterion_params &get_params() const { return m_params; }
+
+  /**
+   * Initialize the criterion with the given parameters
+   *
+   * This method is called once per benchmark run, before any measurements are provided.
+   */
+  void initialize(const criterion_params &params) 
+  {
+    m_params.set_from(params);
+    this->do_initialize();
+  }
+
+  /**
+   * Add the latest measurement to the criterion
+   */
+  void add_measurement(nvbench::float64_t measurement)
+  {
+    this->do_add_measurement(measurement);
+  }
+
+  /**
+   * Check if the criterion has been met for all measurements processed by `add_measurement`
+   */
+  bool is_finished()
+  {
+    return this->do_is_finished();
+  }
+
+protected:
+  /**
+   * Initialize the criterion after updating the parameters
+   */
+  virtual void do_initialize() = 0;
+
+  /**
+   * Add the latest measurement to the criterion
+   */
+  virtual void do_add_measurement(nvbench::float64_t measurement) = 0;
+
+  /**
+   * Check if the criterion has been met for all measurements processed by `add_measurement`
+   */
+  virtual bool do_is_finished() = 0;
+};
+
+} // namespace nvbench
diff --git a/nvbench/stopping_criterion.cxx b/nvbench/stopping_criterion.cxx
new file mode 100644
index 00000000..976a1a71
--- /dev/null
+++ b/nvbench/stopping_criterion.cxx
@@ -0,0 +1,124 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/stopping_criterion.cuh>
+
+#include <nvbench/detail/throw.cuh>
+
+
+namespace nvbench
+{
+
+// Default constructor for compatibility with old code
+criterion_params::criterion_params()
+    : criterion_params{{"max-noise", nvbench::detail::compat_max_noise()},
+                       {"min-time", nvbench::detail::compat_min_time()}}
+{}
+
+criterion_params::criterion_params(
+  std::initializer_list<std::pair<std::string, nvbench::named_values::value_type>> list)
+{
+  for (const auto &[name, value] : list)
+  {
+    m_named_values.set_value(name, value);
+  }
+}
+
+void criterion_params::set_from(const criterion_params &other)
+{
+  for (const std::string &name : this->get_names())
+  {
+    if (other.has_value(name))
+    {
+      if (this->get_type(name) != other.get_type(name))
+      {
+        NVBENCH_THROW(std::runtime_error,
+                      "Mismatched types for named value \"{}\". "
+                      "Expected {}, got {}.",
+                      name,
+                      static_cast<int>(this->get_type(name)),
+                      static_cast<int>(other.get_type(name)));
+      }
+      m_named_values.remove_value(name);
+      m_named_values.set_value(name, other.m_named_values.get_value(name));
+    }
+  }
+}
+
+void criterion_params::set_int64(std::string name, nvbench::int64_t value)
+{
+  if (m_named_values.has_value(name)) 
+  {
+    m_named_values.remove_value(name);
+  }
+
+  m_named_values.set_int64(name, value);
+}
+
+void criterion_params::set_float64(std::string name, nvbench::float64_t value)
+{
+  if (m_named_values.has_value(name)) 
+  {
+    m_named_values.remove_value(name);
+  }
+
+  m_named_values.set_float64(name, value);
+}
+
+void criterion_params::set_string(std::string name, std::string value)
+{
+  if (m_named_values.has_value(name)) 
+  {
+    m_named_values.remove_value(name);
+  }
+
+  m_named_values.set_string(name, std::move(value));
+}
+
+bool criterion_params::has_value(const std::string &name) const
+{
+  return m_named_values.has_value(name);
+}
+
+nvbench::int64_t criterion_params::get_int64(const std::string &name) const
+{
+  return m_named_values.get_int64(name);
+}
+
+nvbench::float64_t criterion_params::get_float64(const std::string &name) const
+{
+  return m_named_values.get_float64(name);
+}
+
+std::string criterion_params::get_string(const std::string &name) const
+{
+  return m_named_values.get_string(name);
+}
+
+std::vector<std::string> criterion_params::get_names() const
+{
+  return m_named_values.get_names();
+}
+
+nvbench::named_values::type criterion_params::get_type(const std::string &name) const
+{
+  return m_named_values.get_type(name);
+}
+
+
+} // namespace nvbench
diff --git a/nvbench/string_axis.cuh b/nvbench/string_axis.cuh
index 2f526e7a..a8af16ef 100644
--- a/nvbench/string_axis.cuh
+++ b/nvbench/string_axis.cuh
@@ -36,25 +36,13 @@ struct string_axis final : public axis_base
 
   ~string_axis() final;
 
-  void set_inputs(std::vector<std::string> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] const std::string &get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<std::string> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] const std::string &get_value(std::size_t i) const { return m_values[i]; }
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<string_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<string_axis>(*this); }
   std::size_t do_get_size() const final { return m_values.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_values[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_values[i]; }
   std::string do_get_description(std::size_t) const final { return {}; }
 
   std::vector<std::string> m_values;
diff --git a/nvbench/summary.cuh b/nvbench/summary.cuh
index 4576b15a..66093c05 100644
--- a/nvbench/summary.cuh
+++ b/nvbench/summary.cuh
@@ -92,10 +92,10 @@ struct summary : public nvbench::named_values
   {}
 
   // move-only
-  summary(const summary &) = delete;
-  summary(summary &&)      = default;
+  summary(const summary &)            = delete;
+  summary(summary &&)                 = default;
   summary &operator=(const summary &) = delete;
-  summary &operator=(summary &&) = default;
+  summary &operator=(summary &&)      = default;
 
   void set_tag(std::string tag) { m_tag = std::move(tag); }
   [[nodiscard]] const std::string &get_tag() const { return m_tag; }
diff --git a/nvbench/test_kernels.cuh b/nvbench/test_kernels.cuh
index e08db315..f46216dc 100644
--- a/nvbench/test_kernels.cuh
+++ b/nvbench/test_kernels.cuh
@@ -18,6 +18,8 @@
 
 #pragma once
 
+#include <nvbench/types.cuh>
+
 #include <cuda/std/chrono>
 
 #include <cuda_runtime.h>
@@ -38,8 +40,8 @@ namespace nvbench
 __global__ void sleep_kernel(double seconds)
 {
   const auto start = cuda::std::chrono::high_resolution_clock::now();
-  const auto ns    = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
+  const auto ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
   const auto finish = start + ns;
 
   auto now = cuda::std::chrono::high_resolution_clock::now();
@@ -53,7 +55,7 @@ __global__ void sleep_kernel(double seconds)
  * Naive copy of `n` values from `in` -> `out`.
  */
 template <typename T, typename U>
-__global__ void copy_kernel(const T* in, U* out, std::size_t n)
+__global__ void copy_kernel(const T *in, U *out, std::size_t n)
 {
   const auto init = blockIdx.x * blockDim.x + threadIdx.x;
   const auto step = blockDim.x * gridDim.x;
@@ -68,7 +70,7 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
  * For `i <- [0,n)`, `out[i] = in[i] % 2`.
  */
 template <typename T, typename U>
-__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
+__global__ void mod2_kernel(const T *in, U *out, std::size_t n)
 {
   const auto init = blockIdx.x * blockDim.x + threadIdx.x;
   const auto step = blockDim.x * gridDim.x;
@@ -79,4 +81,4 @@ __global__ void mod2_kernel(const T* in, U* out, std::size_t n)
   }
 }
 
-}
+} // namespace nvbench
diff --git a/nvbench/type_axis.cuh b/nvbench/type_axis.cuh
index 2ee91445..3a4c59a1 100644
--- a/nvbench/type_axis.cuh
+++ b/nvbench/type_axis.cuh
@@ -43,7 +43,7 @@ struct type_axis final : public axis_base
   template <typename TypeList>
   void set_inputs();
 
-  void set_active_inputs(const std::vector<std::string>& inputs);
+  void set_active_inputs(const std::vector<std::string> &inputs);
 
   [[nodiscard]] bool get_is_active(const std::string &input) const;
   [[nodiscard]] bool get_is_active(std::size_t index) const;
@@ -57,23 +57,13 @@ struct type_axis final : public axis_base
   /**
    * The index in this axis of the type with the specified `input_string`.
    */
-  [[nodiscard]] std::size_t
-  get_type_index(const std::string &input_string) const;
+  [[nodiscard]] std::size_t get_type_index(const std::string &input_string) const;
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<type_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<type_axis>(*this); }
   std::size_t do_get_size() const final { return m_input_strings.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_input_strings[i];
-  }
-  std::string do_get_description(std::size_t i) const final
-  {
-    return m_descriptions[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_input_strings[i]; }
+  std::string do_get_description(std::size_t i) const final { return m_descriptions[i]; }
 
   std::vector<std::string> m_input_strings;
   std::vector<std::string> m_descriptions;
diff --git a/nvbench/type_axis.cxx b/nvbench/type_axis.cxx
index af436ad1..f89ec1d5 100644
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -35,10 +35,10 @@ void type_axis::set_active_inputs(const std::vector<std::string> &inputs)
 {
   m_mask.clear();
   m_mask.resize(m_input_strings.size(), false);
-  for (const auto& input : inputs)
+  for (const auto &input : inputs)
   {
     const auto idx = this->get_type_index(input);
-    m_mask[idx] = true;
+    m_mask[idx]    = true;
   }
 }
 
@@ -47,21 +47,16 @@ bool type_axis::get_is_active(const std::string &input) const
   return this->get_is_active(this->get_type_index(input));
 }
 
-bool type_axis::get_is_active(std::size_t idx) const
-{
-  return m_mask.at(idx);
-}
+bool type_axis::get_is_active(std::size_t idx) const { return m_mask.at(idx); }
 
 std::size_t type_axis::get_active_count() const
 {
-  return static_cast<std::size_t>(
-    std::count(m_mask.cbegin(), m_mask.cend(), true));
+  return static_cast<std::size_t>(std::count(m_mask.cbegin(), m_mask.cend(), true));
 }
 
 std::size_t type_axis::get_type_index(const std::string &input_string) const
 {
-  auto it =
-    std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
+  auto it = std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
   if (it == m_input_strings.end())
   {
     NVBENCH_THROW(std::runtime_error,
@@ -72,7 +67,7 @@ std::size_t type_axis::get_type_index(const std::string &input_string) const
                   m_input_strings);
   }
 
-  return it - m_input_strings.cbegin();
+  return static_cast<std::size_t>(it - m_input_strings.cbegin());
 }
 
 } // namespace nvbench
diff --git a/nvbench/type_strings.cuh b/nvbench/type_strings.cuh
index 287e0f93..b915854c 100644
--- a/nvbench/type_strings.cuh
+++ b/nvbench/type_strings.cuh
@@ -30,17 +30,17 @@ namespace nvbench
 std::string demangle(const std::string &str);
 
 template <typename T>
-std::string demangle() { return demangle(typeid(T).name()); }
+std::string demangle()
+{
+  return demangle(typeid(T).name());
+}
 
 template <typename T>
 struct type_strings
 {
   // The string used to identify the type in shorthand (e.g. output tables and
   // CLI options):
-  static std::string input_string()
-  {
-    return nvbench::demangle<T>();
-  }
+  static std::string input_string() { return nvbench::demangle<T>(); }
 
   // A more descriptive identifier for the type, if input_string is not a common
   // identifier. May be blank if `input_string` is obvious.
@@ -56,10 +56,7 @@ struct type_strings<std::integral_constant<T, Value>>
 
   // A more descriptive identifier for the type, if input_string is not a common
   // identifier. May be blank if `input_string` is obvious.
-  static std::string description()
-  {
-    return nvbench::demangle<std::integral_constant<T, Value>>();
-  }
+  static std::string description() { return nvbench::demangle<std::integral_constant<T, Value>>(); }
 };
 
 } // namespace nvbench
@@ -67,15 +64,15 @@ struct type_strings<std::integral_constant<T, Value>>
 /*!
  * Declare an `input_string` and `description` to use with a specific `type`.
  */
-#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)           \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <>                                                                  \
-  struct type_strings<Type>                                                    \
-  {                                                                            \
-    static std::string input_string() { return {InputString}; }                \
-    static std::string description() { return {Description}; }                 \
-  };                                                                           \
+#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)                               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <>                                                                                      \
+  struct type_strings<Type>                                                                        \
+  {                                                                                                \
+    static std::string input_string() { return {InputString}; }                                    \
+    static std::string description() { return {Description}; }                                     \
+  };                                                                                               \
   }
 
 NVBENCH_DECLARE_TYPE_STRINGS(nvbench::int8_t, "I8", "int8_t");
diff --git a/scripts/nvbench_histogram.py b/scripts/nvbench_histogram.py
old mode 100644
new mode 100755
index 1df17cc4..5c37d293
--- a/scripts/nvbench_histogram.py
+++ b/scripts/nvbench_histogram.py
@@ -38,6 +38,20 @@ def parse_files():
     return filenames
 
 
+def extract_filename(summary):
+    summary_data = summary["data"]
+    value_data = next(filter(lambda v: v["name"] == "filename", summary_data))
+    assert(value_data["type"] == "string")
+    return value_data["value"]
+
+
+def extract_size(summary):
+    summary_data = summary["data"]
+    value_data = next(filter(lambda v: v["name"] == "size", summary_data))
+    assert(value_data["type"] == "int64")
+    return int(value_data["value"])
+
+
 def parse_samples_meta(filename, state):
     summaries = state["summaries"]
     if not summaries:
@@ -49,13 +63,13 @@ def parse_samples_meta(filename, state):
     if not summary:
         return None, None
 
-    sample_filename = summary["filename"]["value"]
+    sample_filename = extract_filename(summary)
 
     # If not absolute, the path is relative to the associated .json file:
     if not os.path.isabs(sample_filename):
         sample_filename = os.path.join(os.path.dirname(filename), sample_filename)
 
-    sample_count = int(summary["size"]["value"])
+    sample_count = extract_size(summary)
     return sample_count, sample_filename
 
 
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 4928ebc9..f4072586 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -4,22 +4,36 @@ set(test_srcs
   create.cu
   cuda_timer.cu
   cpu_timer.cu
+  criterion_manager.cu
+  criterion_params.cu
+  custom_main_custom_args.cu
+  custom_main_custom_exceptions.cu
+  custom_main_global_state_raii.cu
   enum_type_list.cu
+  entropy_criterion.cu
   float64_axis.cu
   int64_axis.cu
   named_values.cu
   option_parser.cu
   range.cu
+  reset_error.cu
   ring_buffer.cu
   runner.cu
   state.cu
+  statistics.cu
   state_generator.cu
+  stdrel_criterion.cu
   string_axis.cu
   type_axis.cu
   type_list.cu
 )
 
-# Metatarget for all examples:
+# Custom arguments:
+# CTest commands+args can't be modified after creation, so we need to rely on substitution.
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_args "--quiet" "--my-custom-arg" "--run-once" "-d" "0")
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_exceptions "--quiet" "--run-once" "-d" "0")
+
+# Metatarget for all tests:
 add_custom_target(nvbench.test.all)
 add_dependencies(nvbench.all nvbench.test.all)
 
@@ -31,10 +45,14 @@ foreach(test_src IN LISTS test_srcs)
   target_link_libraries(${test_name} PRIVATE nvbench::nvbench fmt)
   set_target_properties(${test_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
   nvbench_config_target(${test_name})
-  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>")
+  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>" ${NVBench_TEST_ARGS_${test_name}})
 
   add_dependencies(nvbench.test.all ${test_name})
 endforeach()
 
+set_tests_properties(nvbench.test.custom_main_custom_exceptions PROPERTIES
+  PASS_REGULAR_EXPRESSION "Custom error detected: Expected exception thrown."
+)
+
 add_subdirectory(cmake)
 add_subdirectory(device)
diff --git a/testing/axes_metadata.cu b/testing/axes_metadata.cu
index 1ea7dd61..cf4d93a6 100644
--- a/testing/axes_metadata.cu
+++ b/testing/axes_metadata.cu
@@ -129,13 +129,13 @@ void test_type_axes()
   fmt::memory_buffer buffer;
   for (const auto &axis : axes.get_axes())
   {
-    fmt::format_to(buffer, "Axis: {}\n", axis->get_name());
+    fmt::format_to(std::back_inserter(buffer), "Axis: {}\n", axis->get_name());
     const auto num_values = axis->get_size();
     for (std::size_t i = 0; i < num_values; ++i)
     {
       auto input_string = axis->get_input_string(i);
       auto description  = axis->get_description(i);
-      fmt::format_to(buffer,
+      fmt::format_to(std::back_inserter(buffer),
                      " - {}{}\n",
                      input_string,
                      description.empty() ? ""
@@ -159,7 +159,7 @@ Axis: Other
   const std::string test = fmt::to_string(buffer);
   const auto diff =
     std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
-  const auto idx = diff.second - test.cbegin();
+  const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
   ASSERT_MSG(test == ref,
              "Differs at character {}.\n"
              "Expected:\n\"{}\"\n\n"
diff --git a/testing/benchmark.cu b/testing/benchmark.cu
index 71ffe033..9581b12c 100644
--- a/testing/benchmark.cu
+++ b/testing/benchmark.cu
@@ -44,13 +44,13 @@ std::vector<T> sort(std::vector<T> &&vec)
 void no_op_generator(nvbench::state &state)
 {
   fmt::memory_buffer params;
-  fmt::format_to(params, "Params:");
+  fmt::format_to(std::back_inserter(params), "Params:");
   const auto &axis_values = state.get_axis_values();
   for (const auto &name : sort(axis_values.get_names()))
   {
     std::visit(
       [&params, &name](const auto &value) {
-        fmt::format_to(params, " {}: {}", name, value);
+        fmt::format_to(std::back_inserter(params), " {}: {}", name, value);
       },
       axis_values.get_value(name));
   }
@@ -101,13 +101,13 @@ void test_type_axes()
   const auto &axes = bench.get_axes().get_axes();
   for (const auto &axis : axes)
   {
-    fmt::format_to(buffer, "Axis: {}\n", axis->get_name());
+    fmt::format_to(std::back_inserter(buffer), "Axis: {}\n", axis->get_name());
     const auto num_values = axis->get_size();
     for (std::size_t i = 0; i < num_values; ++i)
     {
       auto input_string = axis->get_input_string(i);
       auto description  = axis->get_description(i);
-      fmt::format_to(buffer,
+      fmt::format_to(std::back_inserter(buffer),
                      " - {}{}\n",
                      input_string,
                      description.empty() ? ""
@@ -148,7 +148,7 @@ void test_type_configs()
       using Integer = nvbench::tl::get<0, Conf>;
       using Float   = nvbench::tl::get<1, Conf>;
       using Other   = nvbench::tl::get<2, Conf>;
-      fmt::format_to(buffer,
+      fmt::format_to(std::back_inserter(buffer),
                      "type_configs[{:2d}] = <{:>3}, {:>3}, {:>4}>\n",
                      idx++,
                      nvbench::type_strings<Integer>::input_string(),
diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt
index 2cb2f5fa..6932c00c 100644
--- a/testing/cmake/CMakeLists.txt
+++ b/testing/cmake/CMakeLists.txt
@@ -12,6 +12,7 @@ set(cmake_opts
   -D "CMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}"
   -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
   -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+  -D "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}"
   -D "CMAKE_CUDA_ARCHITECTURES=${arches}"
 )
 
diff --git a/testing/create.cu b/testing/create.cu
index d7d9586e..6ed7fff5 100644
--- a/testing/create.cu
+++ b/testing/create.cu
@@ -44,13 +44,13 @@ std::vector<T> sort(std::vector<T> &&vec)
 void no_op_generator(nvbench::state &state)
 {
   fmt::memory_buffer params;
-  fmt::format_to(params, "Params:");
+  fmt::format_to(std::back_inserter(params), "Params:");
   const auto &axis_values = state.get_axis_values();
   for (const auto &name : sort(axis_values.get_names()))
   {
     std::visit(
       [&params, &name](const auto &value) {
-        fmt::format_to(params, " {}: {}", name, value);
+        fmt::format_to(std::back_inserter(params), " {}: {}", name, value);
       },
       axis_values.get_value(name));
   }
@@ -109,7 +109,7 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench,
   for (const auto &state : states)
   {
     ASSERT(state.is_skipped());
-    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
+    fmt::format_to(std::back_inserter(buffer), "{}\n", state.get_skip_reason());
   }
   return fmt::to_string(buffer);
 }
diff --git a/testing/criterion_manager.cu b/testing/criterion_manager.cu
new file mode 100644
index 00000000..da0ddb0f
--- /dev/null
+++ b/testing/criterion_manager.cu
@@ -0,0 +1,75 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+void test_standard_criteria_exist()
+{
+  ASSERT(nvbench::criterion_manager::get().get_criterion("stdrel").get_name() == "stdrel");
+  ASSERT(nvbench::criterion_manager::get().get_criterion("entropy").get_name() == "entropy");
+}
+
+class custom_criterion : public nvbench::stopping_criterion_base
+{
+public:
+  custom_criterion()
+      : nvbench::stopping_criterion_base("custom", nvbench::criterion_params{})
+  {}
+
+protected:
+  virtual void do_initialize() override {}
+  virtual void do_add_measurement(nvbench::float64_t /* measurement */) override {}
+  virtual bool do_is_finished() override { return true; }
+};
+
+void test_no_duplicates_are_allowed()
+{
+  nvbench::criterion_manager& manager = nvbench::criterion_manager::get();
+  bool exception_triggered = false;
+
+  try {
+    [[maybe_unused]] nvbench::stopping_criterion_base& _ = manager.get_criterion("custom");
+  } catch(...) {
+    exception_triggered = true;
+  }
+  ASSERT(exception_triggered);
+
+  std::unique_ptr<custom_criterion> custom_ptr = std::make_unique<custom_criterion>();
+  custom_criterion* custom_raw = custom_ptr.get();
+  ASSERT(&manager.add(std::move(custom_ptr)) == custom_raw);
+
+  nvbench::stopping_criterion_base& custom = nvbench::criterion_manager::get().get_criterion("custom");
+  ASSERT(custom_raw == &custom);
+
+  exception_triggered = false;
+  try {
+    manager.add(std::make_unique<custom_criterion>());
+  } catch(...) {
+    exception_triggered = true;
+  }
+  ASSERT(exception_triggered);
+}
+
+int main()
+{
+  test_standard_criteria_exist();
+  test_no_duplicates_are_allowed();
+}
diff --git a/testing/criterion_params.cu b/testing/criterion_params.cu
new file mode 100644
index 00000000..4eceefaa
--- /dev/null
+++ b/testing/criterion_params.cu
@@ -0,0 +1,63 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/criterion_manager.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+void test_compat_parameters()
+{
+  nvbench::criterion_params params;
+
+  ASSERT(params.has_value("max-noise"));
+  ASSERT(params.has_value("min-time"));
+
+  ASSERT(params.get_float64("max-noise") == nvbench::detail::compat_max_noise());
+  ASSERT(params.get_float64("min-time") == nvbench::detail::compat_min_time());
+}
+
+void test_compat_overwrite()
+{
+  nvbench::criterion_params params;
+  params.set_float64("max-noise", 40000.0);
+  params.set_float64("min-time", 42000.0);
+
+  ASSERT(params.get_float64("max-noise") == 40000.0);
+  ASSERT(params.get_float64("min-time") == 42000.0);
+}
+
+void test_overwrite()
+{
+  nvbench::criterion_params params;
+  ASSERT(!params.has_value("custom"));
+
+  params.set_float64("custom", 42.0);
+  ASSERT(params.get_float64("custom") == 42.0);
+
+  params.set_float64("custom", 4.2);
+  ASSERT(params.get_float64("custom") == 4.2);
+}
+
+int main()
+{
+  test_compat_parameters();
+  test_compat_overwrite();
+  test_overwrite();
+}
+
diff --git a/testing/custom_main_custom_args.cu b/testing/custom_main_custom_args.cu
new file mode 100644
index 00000000..f7e331e3
--- /dev/null
+++ b/testing/custom_main_custom_args.cu
@@ -0,0 +1,132 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+#include "nvbench/cuda_call.cuh"
+
+/******************************************************************************
+ * Install custom parser.
+ * See <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+//
+// Step 1: Define a custom argument handler that accepts a vector of strings.
+//          - This handler should modify the vector in place to remove any custom
+//            arguments it handles. NVBench will then parse the remaining arguments.
+//          - The handler should also update any application state needed to handle
+//            the custom arguments.
+//
+
+// User code to handle a specific argument:
+void handle_my_custom_arg();
+
+// NVBench hook for modifying the command line arguments before parsing:
+void custom_arg_handler(std::vector<std::string> &args)
+{
+  // Handle and remove "--my-custom-arg"
+  if (auto it = std::find(args.begin(), args.end(), "--my-custom-arg"); it != args.end())
+  {
+    handle_my_custom_arg();
+    args.erase(it);
+  }
+}
+
+//
+// Step 2: Install the custom argument handler.
+//         - This is done by defining a macro that invokes the custom argument handler.
+//
+
+// Install the custom argument handler:
+// Either define this before any NVBench headers are included, or undefine and redefine:
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) custom_arg_handler(args)
+
+// Step 3: Define `main`
+//
+// After installing the custom argument handler, define the main function using:
+//
+// ```
+// NVBENCH_MAIN
+// ```
+//
+// Here, this is done at the end of this file.
+
+/******************************************************************************
+ * Unit test verification:
+ ******************************************************************************/
+
+// Track whether the args are found / handled.
+bool h_custom_arg_found             = false;
+bool h_handled_on_device            = false;
+__device__ bool d_custom_arg_found  = false;
+__device__ bool d_handled_on_device = false;
+
+// Copy host values to device:
+void copy_host_state_to_device()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_custom_arg_found, &h_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_handled_on_device, &h_handled_on_device, sizeof(bool)));
+}
+
+// Copy device values to host:
+void copy_device_state_to_host()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_custom_arg_found, d_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_handled_on_device, d_handled_on_device, sizeof(bool)));
+}
+
+void handle_my_custom_arg()
+{
+  h_custom_arg_found = true;
+  copy_host_state_to_device();
+}
+
+void verify()
+{
+  copy_device_state_to_host();
+  if (!h_custom_arg_found)
+  {
+    throw std::runtime_error("Custom argument not detected.");
+  }
+  if (!h_handled_on_device)
+  {
+    throw std::runtime_error("Custom argument not handled on device.");
+  }
+}
+
+// Install a verification check to ensure the custom argument was handled.
+// Use the `PRE` finalize hook to ensure we check device state before resetting the context.
+#undef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() verify()
+
+// Simple kernel/benchmark to check the handler can modify CUDA state (run on NVBench's stream):
+__global__ void kernel()
+{
+  if (d_custom_arg_found)
+  {
+    d_handled_on_device = true;
+  }
+}
+void bench(nvbench::state &state)
+{
+  state.exec([](nvbench::launch &launch) { kernel<<<1, 1, 0, launch.get_stream()>>>(); });
+}
+NVBENCH_BENCH(bench);
+
+// Define the customized main function:
+NVBENCH_MAIN
diff --git a/testing/custom_main_custom_exceptions.cu b/testing/custom_main_custom_exceptions.cu
new file mode 100644
index 00000000..b1f9b9c2
--- /dev/null
+++ b/testing/custom_main_custom_exceptions.cu
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2022 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <stdexcept>
+
+/******************************************************************************
+ * Install exception handler around the NVBench main body. This is used
+ * to print helpful information when a user exception is thrown before exiting.
+ *
+ * Note that this will **NOT** be used when a benchmark throws an exception.
+ * That will fail the benchmark and note the exception, and continue
+ * execution.
+ *
+ * This is used to catch exceptions in user extensions of NVBench, things like
+ * customized initialization, command line parsing, finalization, etc. See
+ * <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+struct user_exception : public std::runtime_error
+{
+  user_exception()
+      : std::runtime_error("Expected exception thrown.")
+  {}
+};
+
+// User code to handle the user exception: report it on stderr, then exit with status 1.
+void handle_my_exception(const user_exception &e)
+{
+  std::cerr << "Custom error detected: " << e.what() << std::endl;
+  std::exit(1);
+}
+
+// Install the exception handler around the NVBench main body.
+// NVBench will have sensible defaults for common exceptions following this if no terminating catch
+// block is defined.
+// Either define this before any NVBench headers are included, or undefine and redefine.
+#undef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                       \
+  catch (user_exception & e) { handle_my_exception(e); }
+
+// For testing purposes, install an argument parser that throws:
+void really_robust_argument_parser(std::vector<std::string> &) { throw user_exception(); }
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) really_robust_argument_parser(args)
+
+// Define the customized main function:
+NVBENCH_MAIN
diff --git a/testing/custom_main_global_state_raii.cu b/testing/custom_main_global_state_raii.cu
new file mode 100644
index 00000000..e3584ab6
--- /dev/null
+++ b/testing/custom_main_global_state_raii.cu
@@ -0,0 +1,121 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstdio>
+
+/******************************************************************************
+ * Test having global state that is initialized and finalized via RAII.
+ *****************************************************************************/
+
+struct raii
+{
+  const char m_ref_data[6];
+  char *m_data;
+  bool m_cuda;
+
+  const char *m_outer_data;
+  bool m_outer_cuda;
+
+  explicit raii(bool cuda, char *outer_data = nullptr, bool outer_cuda = false)
+      : m_ref_data{'a', 'b', 'c', '1', '2', '3'}
+      , m_data(nullptr)
+      , m_cuda(cuda)
+      , m_outer_data(outer_data)
+      , m_outer_cuda(outer_cuda)
+  {
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: allocating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMalloc(&m_data, 6));
+      NVBENCH_CUDA_CALL(cudaMemcpy(m_data, m_ref_data, 6, cudaMemcpyHostToDevice));
+    }
+    else
+    {
+      printf("(%p) RAII test: allocating host memory\n", this);
+      m_data = new char[6];
+      std::copy(m_ref_data, m_ref_data + 6, m_data);
+    }
+  }
+
+  ~raii()
+  {
+    this->verify();
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: invalidating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMemset(m_data, 0, 6));
+      printf("(%p) RAII test: freeing device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaFree(m_data));
+    }
+    else
+    {
+      printf("(%p) RAII test: invalidating host memory\n", this);
+      std::fill(m_data, m_data + 6, '\0');
+      printf("(%p) RAII test: freeing host memory\n", this);
+      delete[] m_data;
+    }
+  }
+
+  void verify() noexcept
+  {
+    printf("(%p) RAII test: verifying instance state\n", this);
+    this->verify(m_cuda, m_data);
+    if (m_outer_data)
+    {
+      printf("(%p) RAII test: verifying outer state\n", this);
+      this->verify(m_outer_cuda, m_outer_data);
+    }
+  }
+
+  void verify(bool cuda, const char *data) noexcept
+  {
+    if (cuda)
+    {
+      char test_data[6];
+      NVBENCH_CUDA_CALL(cudaMemcpy(test_data, data, 6, cudaMemcpyDeviceToHost));
+      if (strncmp(test_data, m_ref_data, 6) != 0)
+      {
+        printf("(%p) RAII test failed: device data mismatch\n", this);
+        std::exit(1);
+      }
+    }
+    else
+    {
+      if (strncmp(data, m_ref_data, 6) != 0)
+      {
+        printf("(%p) RAII test failed: host data mismatch\n", this);
+        std::exit(1);
+      }
+    }
+  }
+};
+
+// These will be destroyed in the opposite order in which they are created:
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) raii raii_outer(false);
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)                                            \
+  [[maybe_unused]] raii raii_inner(true, raii_outer.m_data, raii_outer.m_cuda);
+
+NVBENCH_MAIN
diff --git a/testing/entropy_criterion.cu b/testing/entropy_criterion.cu
new file mode 100644
index 00000000..df489c96
--- /dev/null
+++ b/testing/entropy_criterion.cu
@@ -0,0 +1,91 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/entropy_criterion.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+#include <vector>
+#include <random>
+#include <numeric>
+
+void test_const()
+{
+  nvbench::criterion_params params;
+  nvbench::detail::entropy_criterion criterion;
+
+  criterion.initialize(params);
+  for (int i = 0; i < 6; i++) 
+  { // nvbench wants at least 5 to compute the standard deviation
+    criterion.add_measurement(42.0);
+  }
+  ASSERT(criterion.is_finished());
+}
+
+void produce_entropy_arch(nvbench::detail::entropy_criterion &criterion)
+{
+  /*
+   * This pattern is designed to simulate the entropy:
+   *
+   *   0.0, 1.0, 1.5, 2.0, 2.3, 2.5 <---- no unexpected measurement after this point
+   *   2.5, 2.4, 2.2, 2.1, 2.0, 1.9 <-+
+   *   1.8, 1.7, 1.6, 1.6, 1.5, 1.4   |
+   *   1.4, 1.3, 1.3, 1.3, 1.2, 1.2   |
+   *   1.1, 1.1, 1.1, 1.0, 1.0, 1.0   +-- entropy only decreases after 5-th sample, 
+   *   1.0, 0.9, 0.9, 0.9, 0.9, 0.9   |   so the slope should be negative
+   *   0.8, 0.8, 0.8, 0.8, 0.8, 0.8   |
+   *   0.7, 0.7, 0.7, 0.7, 0.7, 0.7 <-+
+   */
+  for (nvbench::float64_t x = 0.0; x < 50.0; x += 1.0)
+  {
+    criterion.add_measurement(x > 5.0 ? 5.0 : x);
+  }
+}
+
+void test_entropy_arch()
+{
+  nvbench::detail::entropy_criterion criterion;
+
+  // The R2 should be around 0.5
+  // The angle should be around -1.83
+  nvbench::criterion_params params;
+  params.set_float64("min-r2", 0.3);
+  params.set_float64("max-angle", -1.0);
+  criterion.initialize(params);
+  produce_entropy_arch(criterion);
+  ASSERT(criterion.is_finished());
+
+  params.set_float64("min-r2", 0.7);
+  criterion.initialize(params);
+  produce_entropy_arch(criterion);
+  ASSERT(!criterion.is_finished());
+
+  params.set_float64("min-r2", 0.3);
+  params.set_float64("max-angle", -2.0);
+  criterion.initialize(params);
+  produce_entropy_arch(criterion);
+  ASSERT(!criterion.is_finished());
+}
+
+int main()
+{
+  test_const();
+  test_entropy_arch();
+}
diff --git a/testing/enum_type_list.cu b/testing/enum_type_list.cu
index 05b26052..88535ba3 100644
--- a/testing/enum_type_list.cu
+++ b/testing/enum_type_list.cu
@@ -24,6 +24,11 @@
 
 #include <type_traits>
 
+// If using gcc 7, disable some tests to WAR a compiler bug. See NVIDIA/nvbench#39.
+#if defined(__GNUC__) && __GNUC__ == 7
+#define USING_GCC_7
+#endif
+
 enum class scoped_enum
 {
   val_1,
@@ -109,9 +114,11 @@ void test_int()
 
 void test_scoped_enum()
 {
+#ifndef USING_GCC_7
   ASSERT((
     std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
                    nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
+#endif
   ASSERT((
     std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1,
                                            scoped_enum::val_2,
@@ -123,6 +130,7 @@ void test_scoped_enum()
 
 void test_unscoped_enum()
 {
+#ifndef USING_GCC_7
   ASSERT(
     (std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
                     nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
@@ -132,6 +140,7 @@ void test_unscoped_enum()
       nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
                          nvbench::enum_type<unscoped_val_2>,
                          nvbench::enum_type<unscoped_val_3>>>));
+#endif
 }
 
 void test_scoped_enum_type_strings()
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 9d7e6a9c..167e833f 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -57,8 +57,8 @@ states_to_string(const std::vector<nvbench::state> &states)
   std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} "
                              "| {:^4} | {:^6} | {:^8} |\n";
 
-  fmt::format_to(buffer, "\n");
-  fmt::format_to(buffer,
+  fmt::format_to(std::back_inserter(buffer), "\n");
+  fmt::format_to(std::back_inserter(buffer),
                  table_format,
                  "State",
                  "TypeConfig",
@@ -72,7 +72,7 @@ states_to_string(const std::vector<nvbench::state> &states)
   std::size_t config = 0;
   for (const auto &state : states)
   {
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    table_format,
                    config++,
                    state.get_type_config_index(),
@@ -1229,6 +1229,27 @@ void test_timeout()
   ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
 }
 
+void test_stopping_criterion()
+{
+  nvbench::option_parser parser;
+  parser.parse(
+    {"--benchmark", "DummyBench", 
+     "--stopping-criterion", "entropy",
+     "--max-angle", "0.42",
+     "--min-r2", "0.6"});
+  const auto& states = parser_to_states(parser);
+
+  ASSERT(states.size() == 1);
+  ASSERT(states[0].get_stopping_criterion() == "entropy");
+
+  const nvbench::criterion_params &criterion_params = states[0].get_criterion_params();
+  ASSERT(criterion_params.has_value("max-angle"));
+  ASSERT(criterion_params.has_value("min-r2"));
+
+  ASSERT(criterion_params.get_float64("max-angle") == 0.42);
+  ASSERT(criterion_params.get_float64("min-r2") == 0.6);
+}
+
 int main()
 try
 {
@@ -1265,6 +1286,8 @@ try
   test_skip_time();
   test_timeout();
 
+  test_stopping_criterion();
+
   return 0;
 }
 catch (std::exception &err)
diff --git a/testing/reset_error.cu b/testing/reset_error.cu
new file mode 100644
index 00000000..8fece930
--- /dev/null
+++ b/testing/reset_error.cu
@@ -0,0 +1,37 @@
+#include <nvbench/cuda_call.cuh>
+
+#include "test_asserts.cuh"
+
+#include <cstdint>
+#include <stdexcept>
+
+namespace
+{
+// Stores 5 * a[i] into b[i] for every launched thread; there is no bounds or null check.
+// main() launches it with null pointers, which the test relies on to make the following
+// synchronize fail.
+__global__ void multiply5(const std::int32_t *__restrict__ a, std::int32_t *__restrict__ b)
+{
+  const auto id = blockIdx.x * blockDim.x + threadIdx.x;
+  b[id] = 5 * a[id];
+}
+} // namespace
+
+// Checks that NVBENCH_CUDA_CALL throws std::runtime_error for a failing call and that
+// cudaGetLastError() reports cudaSuccess once the exception has been caught.
+int main()
+{
+  multiply5<<<256, 256>>>(nullptr, nullptr);
+
+  try
+  {
+    NVBENCH_CUDA_CALL(cudaStreamSynchronize(0));
+    ASSERT(false); // Only reached if the failing synchronize did not throw.
+  }
+  catch (const std::runtime_error &)
+  {
+    ASSERT(cudaGetLastError() == cudaError_t::cudaSuccess);
+  }
+
+  return 0;
+}
diff --git a/testing/ring_buffer.cu b/testing/ring_buffer.cu
index 4e138056..5af53431 100644
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -27,7 +27,7 @@ template <typename T>
 bool equal(const nvbench::detail::ring_buffer<T> &buffer,
            const std::vector<T> &reference)
 {
-  return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin());
+  return std::equal(buffer.begin(), buffer.end(), reference.begin());
 }
 
 int main()
@@ -62,12 +62,12 @@ try
   ASSERT(avg.size() == 3);
   ASSERT(avg.capacity() == 3);
   ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
-  ASSERT(equal(avg, {5, 2, -15}));
+  ASSERT(equal(avg, {2, -15, 5}));
 
   avg.push_back(0);
   ASSERT(avg.size() == 3);
   ASSERT(avg.capacity() == 3);
-  ASSERT(equal(avg, {5, 0, -15}));
+  ASSERT(equal(avg, {-15, 5, 0}));
   ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());
 
   avg.push_back(128);
diff --git a/testing/runner.cu b/testing/runner.cu
index 157e4548..6335d276 100644
--- a/testing/runner.cu
+++ b/testing/runner.cu
@@ -43,13 +43,13 @@ std::vector<T> sort(std::vector<T> &&vec)
 void no_op_generator(nvbench::state &state)
 {
   fmt::memory_buffer params;
-  fmt::format_to(params, "Params:");
+  fmt::format_to(std::back_inserter(params), "Params:");
   const auto &axis_values = state.get_axis_values();
   for (const auto &name : sort(axis_values.get_names()))
   {
     std::visit(
       [&params, &name](const auto &value) {
-        fmt::format_to(params, " {}: {}", name, value);
+        fmt::format_to(std::back_inserter(params), " {}: {}", name, value);
       },
       axis_values.get_value(name));
   }
@@ -124,7 +124,7 @@ void test_non_types()
   for (const auto &state : bench.get_states())
   {
     ASSERT(state.is_skipped() == true);
-    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
+    fmt::format_to(std::back_inserter(buffer), "{}\n", state.get_skip_reason());
   }
 
   const std::string ref = R"expected(Params: Float: 11 Int: 1 String: One
@@ -184,7 +184,7 @@ void test_types()
   for (const auto &state : bench.get_states())
   {
     ASSERT(state.is_skipped() == true);
-    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
+    fmt::format_to(std::back_inserter(buffer), "{}\n", state.get_skip_reason());
   }
 
   const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool
@@ -228,7 +228,7 @@ void test_both()
   for (const auto &state : bench.get_states())
   {
     ASSERT(state.is_skipped() == true);
-    fmt::format_to(buffer, "{}\n", state.get_skip_reason());
+    fmt::format_to(std::back_inserter(buffer), "{}\n", state.get_skip_reason());
   }
 
   const std::string ref =
diff --git a/testing/state_generator.cu b/testing/state_generator.cu
index cb584be5..f75be021 100644
--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -89,17 +89,17 @@ void test_basic()
   for (sg.init(); sg.iter_valid(); sg.next())
   {
     line.clear();
-    fmt::format_to(line, "| {:^2}", line_num++);
+    fmt::format_to(std::back_inserter(line), "| {:^2}", line_num++);
     for (auto &axis_index : sg.get_current_indices())
     {
       ASSERT(axis_index.type == nvbench::axis_type::string);
-      fmt::format_to(line,
+      fmt::format_to(std::back_inserter(line),
                      " | {}: {}/{}",
                      axis_index.axis,
                      axis_index.index,
                      axis_index.size);
     }
-    fmt::format_to(buffer, "{} |\n", fmt::to_string(line));
+    fmt::format_to(std::back_inserter(buffer), "{} |\n", fmt::to_string(line));
   }
 
   const std::string ref =
@@ -166,8 +166,8 @@ void test_create()
   const std::string table_format =
     "| {:^5} | {:^10} | {:^7} | {:^7} | {:^9} | {:^9} |\n";
 
-  fmt::format_to(buffer, "\n");
-  fmt::format_to(buffer,
+  fmt::format_to(std::back_inserter(buffer), "\n");
+  fmt::format_to(std::back_inserter(buffer),
                  table_format,
                  "State",
                  "TypeConfig",
@@ -179,7 +179,7 @@ void test_create()
   std::size_t config = 0;
   for (const auto &state : states)
   {
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    table_format,
                    config++,
                    state.get_type_config_index(),
@@ -258,8 +258,8 @@ void test_create_with_types()
   std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} "
                              "| {:^7} | {:^9} | {:^9} |\n";
 
-  fmt::format_to(buffer, "\n");
-  fmt::format_to(buffer,
+  fmt::format_to(std::back_inserter(buffer), "\n");
+  fmt::format_to(std::back_inserter(buffer),
                  table_format,
                  "State",
                  "TypeConfig",
@@ -274,7 +274,7 @@ void test_create_with_types()
   std::size_t config = 0;
   for (const auto &state : states)
   {
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    table_format,
                    config++,
                    state.get_type_config_index(),
@@ -607,8 +607,8 @@ void test_create_with_masked_types()
   std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} "
                              "| {:^7} | {:^9} | {:^9} |\n";
 
-  fmt::format_to(buffer, "\n");
-  fmt::format_to(buffer,
+  fmt::format_to(std::back_inserter(buffer), "\n");
+  fmt::format_to(std::back_inserter(buffer),
                  table_format,
                  "State",
                  "TypeConfig",
@@ -623,7 +623,7 @@ void test_create_with_masked_types()
   std::size_t config = 0;
   for (const auto &state : states)
   {
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    table_format,
                    config++,
                    state.get_type_config_index(),
@@ -737,13 +737,13 @@ void test_devices()
   fmt::memory_buffer buffer;
   const std::string table_format = "| {:^5} | {:^6} | {:^5} | {:^3} |\n";
 
-  fmt::format_to(buffer, "\n");
-  fmt::format_to(buffer, table_format, "State", "Device", "S", "I");
+  fmt::format_to(std::back_inserter(buffer), "\n");
+  fmt::format_to(std::back_inserter(buffer), table_format, "State", "Device", "S", "I");
 
   std::size_t config = 0;
   for (const auto &state : states)
   {
-    fmt::format_to(buffer,
+    fmt::format_to(std::back_inserter(buffer),
                    table_format,
                    config++,
                    state.get_device()->get_id(),
diff --git a/testing/statistics.cu b/testing/statistics.cu
new file mode 100644
index 00000000..a67a0448
--- /dev/null
+++ b/testing/statistics.cu
@@ -0,0 +1,129 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/statistics.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+#include <algorithm>
+#include <vector>
+
+namespace statistics = nvbench::detail::statistics;
+
+void test_mean()
+{
+  {
+    std::vector<nvbench::float64_t> data{1.0, 2.0, 3.0, 4.0, 5.0};
+    const nvbench::float64_t actual = statistics::compute_mean(std::begin(data), std::end(data));
+    const nvbench::float64_t expected = 3.0;
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+
+  {
+    std::vector<nvbench::float64_t> data;
+    const bool finite = std::isfinite(statistics::compute_mean(std::begin(data), std::end(data)));
+    ASSERT(!finite);
+  }
+}
+
+void test_std()
+{
+  std::vector<nvbench::float64_t> data{1.0, 2.0, 3.0, 4.0, 5.0};
+  const nvbench::float64_t mean = 3.0;
+  const nvbench::float64_t actual = statistics::standard_deviation(std::begin(data), std::end(data), mean);
+  const nvbench::float64_t expected = 1.581;
+  ASSERT(std::abs(actual - expected) < 0.001);
+}
+
+void test_lin_regression()
+{ // Fits a line to each ys and checks slope/intercept to within 0.001.
+  { // Rising samples: expect slope 1, intercept 1.
+    std::vector<nvbench::float64_t> ys{1.0, 2.0, 3.0, 4.0, 5.0};
+    auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
+    ASSERT(std::abs(slope - 1.0) < 0.001);
+    ASSERT(std::abs(intercept - 1.0) < 0.001);
+  }
+  { // Constant samples: expect slope 0, intercept 42.
+    std::vector<nvbench::float64_t> ys{42.0, 42.0, 42.0};
+    auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
+    ASSERT(std::abs(slope) < 0.001);
+    ASSERT(std::abs(intercept - 42.0) < 0.001);
+  }
+  { // Falling samples: expect slope -4, intercept 8.
+    std::vector<nvbench::float64_t> ys{8.0, 4.0, 0.0};
+    auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
+    ASSERT(std::abs(slope + 4.0) < 0.001);
+    ASSERT(std::abs(intercept - 8.0) < 0.001);
+  }
+}
+
+void test_r2()
+{
+  {
+    std::vector<nvbench::float64_t> ys{1.0, 2.0, 3.0, 4.0, 5.0};
+    auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
+    const nvbench::float64_t actual = statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
+    const nvbench::float64_t expected = 1.0;
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+  {
+    std::vector<nvbench::float64_t> signal{1.0, 2.0, 3.0, 4.0, 5.0};
+    std::vector<nvbench::float64_t> noise{-1.0, 1.0, -1.0, 1.0, -1.0};
+    std::vector<nvbench::float64_t> ys(signal.size());
+
+    std::transform(std::begin(signal),
+                   std::end(signal),
+                   std::begin(noise),
+                   std::begin(ys),
+                   std::plus<nvbench::float64_t>());
+
+    auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
+    const nvbench::float64_t expected = 0.675;
+    const nvbench::float64_t actual = statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+}
+
+void test_slope_conversion()
+{
+  {
+    const nvbench::float64_t actual = statistics::slope2deg(0.0);
+    const nvbench::float64_t expected = 0.0;
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+  {
+    const nvbench::float64_t actual = statistics::slope2deg(1.0);
+    const nvbench::float64_t expected = 45.0;
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+  {
+    const nvbench::float64_t actual = statistics::slope2deg(5.0);
+    const nvbench::float64_t expected = 78.69;
+    ASSERT(std::abs(actual - expected) < 0.001);
+  }
+}
+
+int main()
+{
+  test_mean();
+  test_std();
+  test_lin_regression();
+  test_r2();
+  test_slope_conversion();
+}
diff --git a/testing/stdrel_criterion.cu b/testing/stdrel_criterion.cu
new file mode 100644
index 00000000..f0affea0
--- /dev/null
+++ b/testing/stdrel_criterion.cu
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2023 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/stdrel_criterion.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
+
+#include "test_asserts.cuh"
+
+#include <vector>
+#include <random>
+#include <numeric>
+
+void test_const()
+{
+  nvbench::criterion_params params;
+  nvbench::detail::stdrel_criterion criterion;
+
+  criterion.initialize(params);
+  for (int i = 0; i < 5; i++)
+  { // nvbench wants at least 5 to compute the standard deviation
+    criterion.add_measurement(42.0);
+  }
+  ASSERT(criterion.is_finished());
+}
+
+std::vector<double> generate(double mean, double rel_std_dev, int size)
+{
+  static std::mt19937::result_type seed = 0;
+  std::mt19937 gen(seed++);
+  std::vector<nvbench::float64_t> v(static_cast<std::size_t>(size));
+  std::normal_distribution<nvbench::float64_t> dist(mean, mean * rel_std_dev);
+  std::generate(v.begin(), v.end(), [&]{ return dist(gen); });
+  return v;
+}
+
+void test_stdrel()
+{
+  const nvbench::int64_t size = 10;
+  const nvbench::float64_t mean = 42.0;
+  const nvbench::float64_t max_noise = 0.1;
+
+  nvbench::criterion_params params;
+  params.set_float64("max-noise", max_noise);
+
+  nvbench::detail::stdrel_criterion criterion;
+  criterion.initialize(params);
+
+  for (nvbench::float64_t measurement: generate(mean, max_noise / 2, size))
+  {
+    criterion.add_measurement(measurement);
+  }
+  ASSERT(criterion.is_finished());
+
+  params.set_float64("max-noise", max_noise);
+  criterion.initialize(params);
+
+  for (nvbench::float64_t measurement: generate(mean, max_noise * 2, size))
+  {
+    criterion.add_measurement(measurement);
+  }
+  ASSERT(!criterion.is_finished());
+}
+
+int main()
+{
+  test_const();
+  test_stdrel();
+}