diff --git a/.gitignore b/.gitignore
index 022fb28..1af36cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*.pyc
__pycache__
*.egg-info
+.ipynb_checkpoints
diff --git a/demo/Kernel_Tuner_demo.css b/demo/Kernel_Tuner_demo.css
new file mode 100644
index 0000000..5069d77
--- /dev/null
+++ b/demo/Kernel_Tuner_demo.css
@@ -0,0 +1,23 @@
+.reveal,
+.reveal h1,
+.reveal h2,
+.reveal h3,
+.reveal h4,
+.reveal h5,
+.reveal h6 {
+ font-family: "Nunito";
+}
+
+.p {
+ font-family: "Assistant";
+}
+
+body.notebook_app.rise-enabled {
+ background: url('img/slide_background.png');
+ background-position: left top;
+ height: 100%;
+ width: 100%;
+ background-repeat: no-repeat;
+ background-size: auto 100%;
+ padding-top: calc(50px + 2vh);
+}
diff --git a/demo/Kernel_Tuner_demo.ipynb b/demo/Kernel_Tuner_demo.ipynb
new file mode 100644
index 0000000..bbee7f7
--- /dev/null
+++ b/demo/Kernel_Tuner_demo.ipynb
@@ -0,0 +1,317 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "09f74f14",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+    "<div align=\"center\">\n",
+    " \n",
+    "# Kernel Tuner demo\n",
+    "\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "\n",
+    "By Ben van Werkhoven, Netherlands eScience Center<br>\n",
+    "b.vanwerkhoven@esciencecenter.nl\n",
+    " \n",
+    "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "305a09d5",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "notes"
+ }
+ },
+ "source": [
+ "alt+r to start the slideshow, spacebar or shift+spacebar to move forward to next slide, comma to remove on screen buttons\n",
+ "\n",
+    "preparation: run the next code cell, start a second terminal and go to the directory of this notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4b5e909f",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "rm matmul_cache.json\n",
+ "rm vector_add.cu"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0fbf8c5a",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Optimizing GPU Applications\n",
+ "\n",
+ "To maximize GPU code performance, you need to find the best combination of:\n",
+ "\n",
+ "* Different mappings of the problem to threads and thread blocks\n",
+ "* Different data layouts in different memories (shared, constant, …)\n",
+ "* Different ways of exploiting special hardware features\n",
+ "* Thread block dimensions\n",
+ "* Code optimizations that may be applied or not\n",
+ "* Work per thread in each dimension\n",
+ "* Loop unrolling factors\n",
+ "* Overlapping computation and communication\n",
+ "* ...\n",
+ "\n",
+ "Problem:\n",
+ "* Creates a very large design space!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59e489ff",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Kernel Tuner\n",
+ "\n",
+ "*A Python tool for optimizing and tuning GPU applications*\n",
+ "\n",
+ "Started in 2016:\n",
+ "* As a software development tool for GPU projects at the eScience center\n",
+ "* To be used directly on existing kernels\n",
+    "* Without inserting dependencies in the kernel code\n",
+ "* Kernels can still be compiled with regular compilers\n",
+ "\n",
+ "Today:\n",
+ "* Comprehensive toolbox for auto-tuning with several tools being built on top\n",
+ "* Developed by a team of 7 developers across CWI, Astron, and eScience center\n",
+ "* Used in over 10 different eScience center projects and by others\n",
+ "\n",
+ "https://github.com/KernelTuner/kernel_tuner"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4e2675bf",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Minimal Example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "169859e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile vector_add.cu\n",
+ "__global__ void vector_add(float *c, float *a, float *b, int n) {\n",
+ " int i = blockIdx.x * block_size_x + threadIdx.x;\n",
+    "    if (i<n) {\n",
+    "        c[i] = a[i] + b[i];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4a5b6c7",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Kernel Tuner Dashboard\n",
+    "\n",
+    "\n",
+ "#### Live visualizations of auto-tuning sessions using Kernel Tuner\n",
+ "\n",
+    "<div align=\"center\">\n",
+    "<img src=\"img/dashboard.png\"/> \n",
+    "https://github.com/KernelTuner/dashboard\n",
+    "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "57b5db81",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Tuning a larger problem"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fed6931e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import OrderedDict\n",
+ "problem_size = (512, 512)\n",
+ "A = np.random.randn(*problem_size).astype(np.float32)\n",
+ "B = np.random.randn(*problem_size).astype(np.float32)\n",
+ "C = np.zeros_like(A)\n",
+ "\n",
+ "args = [C, A, B]\n",
+ "\n",
+ "tune_params = OrderedDict()\n",
+ "tune_params[\"block_size_x\"] = [2**i for i in range(0, 11)]\n",
+ "tune_params[\"block_size_y\"] = [2**i for i in range(0, 11)]\n",
+ "tune_params[\"tile_size_x\"] = [2**i for i in range(0, 6)]\n",
+ "tune_params[\"tile_size_y\"] = [2**i for i in range(0, 6)]\n",
+ "\n",
+ "restrict = [\"block_size_x == block_size_y * tile_size_y\"]\n",
+ "grid_div_x = [\"block_size_x\", \"tile_size_x\"]\n",
+ "grid_div_y = [\"block_size_y\", \"tile_size_y\"]\n",
+ "\n",
+ "from kernel_tuner.nvml import NVMLObserver\n",
+ "nvml_observer = NVMLObserver([\"nvml_energy\", \"temperature\", \"core_freq\"])\n",
+ "\n",
+ "metrics = OrderedDict()\n",
+ "metrics[\"GFLOP/s\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"time\"] / 1e3)\n",
+ "metrics[\"GFLOPs/W\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"nvml_energy\"])\n",
+ "\n",
+ "_ = kt.tune_kernel(\"matmul_kernel\", \"matmul.cu\", problem_size, args, tune_params,\n",
+ " observers=[nvml_observer], grid_div_y=grid_div_y, grid_div_x=grid_div_x,\n",
+ " restrictions=restrict, metrics=metrics, cache=\"matmul_cache.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3778bbd1",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "source": [
+ "While the previous cell is running go to second terminal and type \"ktdashboard matmul_cache.json\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7377204",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Final remarks\n",
+ "\n",
+ "Currently, using Kernel Tuner to optimize and tune code in:\n",
+ "* Ultrasound Brain Imaging with Erasmus MC (RECRUIT)\n",
+ "* Atmospheric Modeling (ESiWACE-2 Microhh)\n",
+ "* Radio Astronomy (CORTEX)\n",
+ "\n",
+ "Kernel Tuner can also be used for optimizing the energy efficiency of GPU applications:\n",
+    "> Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning<br>\n",
+    "R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg<br>\n",
+ "International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022 \n",
+ "\n",
+    "Main repository:<br>\n",
+    "https://github.com/KernelTuner/kernel_tuner<br>\n",
+    "Documentation:<br>\n",
+    "https://KernelTuner.github.io<br>\n",
+    "Tutorial:<br>\n",
+    "https://github.com/KernelTuner/kernel_tuner_tutorial"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f89b4b0",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Slideshow",
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "rise": {
+ "transition": "none"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demo/img/dashboard.png b/demo/img/dashboard.png
new file mode 100644
index 0000000..00cab9e
Binary files /dev/null and b/demo/img/dashboard.png differ
diff --git a/demo/img/dashboard_logo.png b/demo/img/dashboard_logo.png
new file mode 100644
index 0000000..a4e8fb7
Binary files /dev/null and b/demo/img/dashboard_logo.png differ
diff --git a/demo/img/slide_background.png b/demo/img/slide_background.png
new file mode 100644
index 0000000..53740f5
Binary files /dev/null and b/demo/img/slide_background.png differ
diff --git a/demo/matmul.cu b/demo/matmul.cu
new file mode 100644
index 0000000..d208308
--- /dev/null
+++ b/demo/matmul.cu
@@ -0,0 +1,91 @@
+/**
+ * The kernel is assumed to be tuned to each device by selecting
+ * the best performing combination of thread block dimensions
+ * and tiling factors in X and Y. In this implementation tiling
+ * in X increases the amount of work per thread block and tiling
+ * in Y increases the amount of work per thread within the block.
+ *
+ * WARNING: THIS KERNEL IS FOR EDUCATIONAL PURPOSES ONLY.
+ * PLEASE *DO NOT USE IT* IN PRODUCTION, USE A BLAS
+ * LIBRARY SUCH AS CUBLAS, CLBLAST OR CUTLASS INSTEAD.
+ *
+ * @author Ben van Werkhoven
+ *
+ */
+
+#define WIDTH 512
+/*
+ * Optimized CUDA kernel for matrix multiplication
+ *
+ * This kernel is optimized according to the directions given
+ * in: "Better performance at lower occupancy" by V. Volkov,
+ * GPU Technology Conference, GTC 2010.
+ *
+ * The thread block dimensions (block_size_x, block_size_y)
+ * and tiling factors (tile_size_x, tile_size_y) are to be
+ * tuned towards each GPU. This kernel assumes that
+ * block_size_x = block_size_y * tile_size_y.
+ *
+ * The kernel computes C=A*B, where A, B, and C are square
+ * matrices with height and width equal to WIDTH
+ */
+__global__ void matmul_kernel(float *C, float *A, float *B) {
+
+    __shared__ float sA[block_size_y*tile_size_y][block_size_x];               // tile of A staged in shared memory
+    __shared__ float sB[block_size_y*tile_size_y][block_size_x * tile_size_x]; // tile of B staged in shared memory
+
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+    int x = blockIdx.x * block_size_x * tile_size_x + threadIdx.x; // global column of this thread's first output element
+    int y = blockIdx.y * block_size_y * tile_size_y + threadIdx.y; // global row of this thread's first output element
+    int k, kb;
+
+    float sum[tile_size_y][tile_size_x]; // per-thread accumulators, one per output element this thread computes
+    #pragma unroll
+    for (int i = 0; i < tile_size_y; i++) {
+        #pragma unroll
+        for (int j = 0; j < tile_size_x; j++) {
+            sum[i][j] = 0.0f;
+        }
+    }
+
+    for (k = 0; k < WIDTH; k += block_size_x) { // march over the shared dimension in block_size_x-wide tiles
+
+        __syncthreads(); // make sure the previous tile is no longer in use before overwriting sA/sB
+        #pragma unroll
+        for (int i = 0; i < tile_size_y; i++) {
+            sA[ty + block_size_y * i][tx] = A[(y+i*block_size_y) * WIDTH + k + tx];
+
+            #pragma unroll
+            for (int j = 0; j < tile_size_x; j++) {
+                sB[ty + block_size_y * i][tx + j * block_size_x] = B[(k + ty + block_size_y * i) * WIDTH + x + j * block_size_x];
+            }
+        }
+        __syncthreads(); // wait until the whole tile has been loaded by all threads
+
+        // multiply the staged tiles, accumulating into the per-thread sum[][]
+        #pragma unroll
+        for (kb = 0; kb < block_size_x; kb++) {
+
+            #pragma unroll
+            for (int i = 0; i < tile_size_y; i++) {
+                #pragma unroll
+                for (int j = 0; j < tile_size_x; j++) {
+                    sum[i][j] += sA[ty + block_size_y * i][kb] * sB[kb][tx + j * block_size_x];
+                }
+            }
+
+        }
+
+    }
+
+    // write the accumulated results to C in global memory
+    #pragma unroll
+    for (int i = 0; i < tile_size_y; i++) {
+        #pragma unroll
+        for (int j = 0; j < tile_size_x; j++) {
+            C[y * WIDTH + x + block_size_y * i * WIDTH + j * block_size_x] = sum[i][j];
+        }
+    }
+
+}
diff --git a/demo/requirements.txt b/demo/requirements.txt
new file mode 100644
index 0000000..eb15857
--- /dev/null
+++ b/demo/requirements.txt
@@ -0,0 +1,2 @@
+notebook
+RISE