diff --git a/.gitignore b/.gitignore
index 022fb28..1af36cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*.pyc
__pycache__
*.egg-info
+.ipynb_checkpoints
diff --git a/demo/Kernel_Tuner_demo.css b/demo/Kernel_Tuner_demo.css
new file mode 100644
index 0000000..5069d77
--- /dev/null
+++ b/demo/Kernel_Tuner_demo.css
@@ -0,0 +1,23 @@
+.reveal,
+.reveal h1,
+.reveal h2,
+.reveal h3,
+.reveal h4,
+.reveal h5,
+.reveal h6 {
+ font-family: "Nunito";
+}
+
+.p {
+ font-family: "Assistant";
+}
+
+body.notebook_app.rise-enabled {
+ background: url('img/slide_background.png');
+ background-position: left top;
+ height: 100%;
+ width: 100%;
+ background-repeat: no-repeat;
+ background-size: auto 100%;
+ padding-top: calc(50px + 2vh);
+}
diff --git a/demo/Kernel_Tuner_demo.ipynb b/demo/Kernel_Tuner_demo.ipynb
new file mode 100644
index 0000000..bbee7f7
--- /dev/null
+++ b/demo/Kernel_Tuner_demo.ipynb
@@ -0,0 +1,317 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "09f74f14",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+    "<div align=\"center\">\n",
+    " \n",
+    "# Kernel Tuner demo\n",
+    "\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "<br>\n",
+    "\n",
+    "\n",
+    "By Ben van Werkhoven, Netherlands eScience Center<br>\n",
+    "b.vanwerkhoven@esciencecenter.nl\n",
+    " \n",
+    "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "305a09d5",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "notes"
+ }
+ },
+ "source": [
+ "alt+r to start the slideshow, spacebar or shift+spacebar to move forward to next slide, comma to remove on screen buttons\n",
+ "\n",
+    "preparation: run the next code cell, start a second terminal and go to the directory of this notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4b5e909f",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "rm matmul_cache.json\n",
+ "rm vector_add.cu"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0fbf8c5a",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Optimizing GPU Applications\n",
+ "\n",
+ "To maximize GPU code performance, you need to find the best combination of:\n",
+ "\n",
+ "* Different mappings of the problem to threads and thread blocks\n",
+ "* Different data layouts in different memories (shared, constant, …)\n",
+ "* Different ways of exploiting special hardware features\n",
+ "* Thread block dimensions\n",
+ "* Code optimizations that may be applied or not\n",
+ "* Work per thread in each dimension\n",
+ "* Loop unrolling factors\n",
+ "* Overlapping computation and communication\n",
+ "* ...\n",
+ "\n",
+ "Problem:\n",
+ "* Creates a very large design space!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59e489ff",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Kernel Tuner\n",
+ "\n",
+ "*A Python tool for optimizing and tuning GPU applications*\n",
+ "\n",
+ "Started in 2016:\n",
+ "* As a software development tool for GPU projects at the eScience center\n",
+ "* To be used directly on existing kernels\n",
+    "* Without inserting dependencies in the kernel code\n",
+ "* Kernels can still be compiled with regular compilers\n",
+ "\n",
+ "Today:\n",
+ "* Comprehensive toolbox for auto-tuning with several tools being built on top\n",
+ "* Developed by a team of 7 developers across CWI, Astron, and eScience center\n",
+ "* Used in over 10 different eScience center projects and by others\n",
+ "\n",
+ "https://github.com/KernelTuner/kernel_tuner"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4e2675bf",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Minimal Example"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "169859e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%writefile vector_add.cu\n",
+ "__global__ void vector_add(float *c, float *a, float *b, int n) {\n",
+ " int i = blockIdx.x * block_size_x + threadIdx.x;\n",
+    "    if (i<n) {\n",
+    "        c[i] = a[i] + b[i];\n",
+    "    }\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4a5b6c7",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "slide"
+    }
+   },
+   "source": [
+    "# Kernel Tuner Dashboard\n",
+    "\n",
+    "\n",
+ "#### Live visualizations of auto-tuning sessions using Kernel Tuner\n",
+ "\n",
+    "<div align=\"center\">\n",
+    "<img src=\"img/dashboard.png\"/> \n",
+    "https://github.com/KernelTuner/dashboard\n",
+    "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "57b5db81",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Tuning a larger problem"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fed6931e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import OrderedDict\n",
+ "problem_size = (512, 512)\n",
+ "A = np.random.randn(*problem_size).astype(np.float32)\n",
+ "B = np.random.randn(*problem_size).astype(np.float32)\n",
+ "C = np.zeros_like(A)\n",
+ "\n",
+ "args = [C, A, B]\n",
+ "\n",
+ "tune_params = OrderedDict()\n",
+ "tune_params[\"block_size_x\"] = [2**i for i in range(0, 11)]\n",
+ "tune_params[\"block_size_y\"] = [2**i for i in range(0, 11)]\n",
+ "tune_params[\"tile_size_x\"] = [2**i for i in range(0, 6)]\n",
+ "tune_params[\"tile_size_y\"] = [2**i for i in range(0, 6)]\n",
+ "\n",
+ "restrict = [\"block_size_x == block_size_y * tile_size_y\"]\n",
+ "grid_div_x = [\"block_size_x\", \"tile_size_x\"]\n",
+ "grid_div_y = [\"block_size_y\", \"tile_size_y\"]\n",
+ "\n",
+ "from kernel_tuner.nvml import NVMLObserver\n",
+ "nvml_observer = NVMLObserver([\"nvml_energy\", \"temperature\", \"core_freq\"])\n",
+ "\n",
+ "metrics = OrderedDict()\n",
+ "metrics[\"GFLOP/s\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"time\"] / 1e3)\n",
+ "metrics[\"GFLOPs/W\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"nvml_energy\"])\n",
+ "\n",
+ "_ = kt.tune_kernel(\"matmul_kernel\", \"matmul.cu\", problem_size, args, tune_params,\n",
+ " observers=[nvml_observer], grid_div_y=grid_div_y, grid_div_x=grid_div_x,\n",
+ " restrictions=restrict, metrics=metrics, cache=\"matmul_cache.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3778bbd1",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "source": [
+ "While the previous cell is running go to second terminal and type \"ktdashboard matmul_cache.json\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7377204",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "slide"
+ }
+ },
+ "source": [
+ "# Final remarks\n",
+ "\n",
+ "Currently, using Kernel Tuner to optimize and tune code in:\n",
+ "* Ultrasound Brain Imaging with Erasmus MC (RECRUIT)\n",
+ "* Atmospheric Modeling (ESiWACE-2 Microhh)\n",
+ "* Radio Astronomy (CORTEX)\n",
+ "\n",
+ "Kernel Tuner can also be used for optimizing the energy efficiency of GPU applications:\n",
+    "> Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning<br>\n",
+    "R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg<br>\n",
+ "International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022 \n",
+ "\n",
+    "Main repository:<br>\n",
+    "https://github.com/KernelTuner/kernel_tuner<br>\n",
+    "Documentation:<br>\n",
+    "https://KernelTuner.github.io<br>\n",
+    "Tutorial:<br>\n",
+    "https://github.com/KernelTuner/kernel_tuner_tutorial"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f89b4b0",
+ "metadata": {
+ "slideshow": {
+ "slide_type": "skip"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Slideshow",
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.12"
+ },
+ "rise": {
+ "transition": "none"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/demo/img/dashboard.png b/demo/img/dashboard.png
new file mode 100644
index 0000000..00cab9e
Binary files /dev/null and b/demo/img/dashboard.png differ
diff --git a/demo/img/dashboard_logo.png b/demo/img/dashboard_logo.png
new file mode 100644
index 0000000..a4e8fb7
Binary files /dev/null and b/demo/img/dashboard_logo.png differ
diff --git a/demo/img/slide_background.png b/demo/img/slide_background.png
new file mode 100644
index 0000000..53740f5
Binary files /dev/null and b/demo/img/slide_background.png differ
diff --git a/demo/matmul.cu b/demo/matmul.cu
new file mode 100644
index 0000000..d208308
--- /dev/null
+++ b/demo/matmul.cu
@@ -0,0 +1,91 @@
+/**
+ * The kernel is assumed to be tuned to each device by selecting
+ * the best performing combination of thread block dimensions
+ * and tiling factors in X and Y. In this implementation tiling
+ * in X increases the amount of work per thread block and tiling
+ * in Y increases the amount of work per thread within the block.
+ *
+ * WARNING: THIS KERNEL IS FOR EDUCATIONAL PURPOSES ONLY.
+ * PLEASE *DO NOT USE IT* IN PRODUCTION, USE A BLAS
+ * LIBRARY SUCH AS CUBLAS, CLBLAST OR CUTLASS INSTEAD.
+ *
+ * @author Ben van Werkhoven
+ *
+ */
+
+#define WIDTH 512
+/*
+ * Optimized CUDA kernel for matrix multiplication
+ *
+ * This kernel is optimized according to the directions given
+ * in: "Better performance at lower occupancy" by V. Volkov,
+ * GPU Technology Conference, GTC 2010.
+ *
+ * The thread block dimensions (block_size_x, block_size_y)
+ * and tiling factors (tile_size_x, tile_size_y) are to be
+ * tuned towards each GPU. This kernel assumes that
+ * block_size_x = block_size_y * tile_size_y.
+ *
+ * The kernel computes C=A*B, where A, B, and C are square
+ * matrices with height and width equal to WIDTH
+ */
+__global__ void matmul_kernel(float *C, float *A, float *B) {
+
+    __shared__ float sA[block_size_y*tile_size_y][block_size_x];               // tile of A staged in shared memory
+    __shared__ float sB[block_size_y*tile_size_y][block_size_x * tile_size_x]; // tile of B staged in shared memory
+
+    int tx = threadIdx.x;
+    int ty = threadIdx.y;
+    int x = blockIdx.x * block_size_x * tile_size_x + threadIdx.x; // global column of this thread's first output element
+    int y = blockIdx.y * block_size_y * tile_size_y + threadIdx.y; // global row of this thread's first output element
+    int k, kb;
+
+    float sum[tile_size_y][tile_size_x]; // per-thread accumulators, one per output element this thread computes
+    #pragma unroll
+    for (int i = 0; i < tile_size_y; i++) {
+        #pragma unroll
+        for (int j = 0; j < tile_size_x; j++) {
+            sum[i][j] = 0.0f;
+        }
+    }
+
+    for (k = 0; k < WIDTH; k += block_size_x) { // march over the shared dimension in block_size_x-wide tiles
+
+        __syncthreads(); // make sure the previous tile is no longer in use before overwriting sA/sB
+        #pragma unroll
+        for (int i = 0; i < tile_size_y; i++) {
+            sA[ty + block_size_y * i][tx] = A[(y+i*block_size_y) * WIDTH + k + tx];
+
+            #pragma unroll
+            for (int j = 0; j < tile_size_x; j++) {
+                sB[ty + block_size_y * i][tx + j * block_size_x] = B[(k + ty + block_size_y * i) * WIDTH + x + j * block_size_x];
+            }
+        }
+        __syncthreads(); // wait until the whole tile has been loaded by all threads
+
+        // multiply the staged tiles, accumulating into the per-thread sum[][]
+        #pragma unroll
+        for (kb = 0; kb < block_size_x; kb++) {
+
+            #pragma unroll
+            for (int i = 0; i < tile_size_y; i++) {
+                #pragma unroll
+                for (int j = 0; j < tile_size_x; j++) {
+                    sum[i][j] += sA[ty + block_size_y * i][kb] * sB[kb][tx + j * block_size_x];
+                }
+            }
+
+        }
+
+    }
+
+    // write the accumulated results to C in global memory
+    #pragma unroll
+    for (int i = 0; i < tile_size_y; i++) {
+        #pragma unroll
+        for (int j = 0; j < tile_size_x; j++) {
+            C[y * WIDTH + x + block_size_y * i * WIDTH + j * block_size_x] = sum[i][j];
+        }
+    }
+
+}
diff --git a/demo/requirements.txt b/demo/requirements.txt
new file mode 100644
index 0000000..eb15857
--- /dev/null
+++ b/demo/requirements.txt
@@ -0,0 +1,2 @@
+notebook
+RISE