diff --git a/.gitignore b/.gitignore index 022fb28..1af36cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.pyc __pycache__ *.egg-info +.ipynb_checkpoints diff --git a/demo/Kernel_Tuner_demo.css b/demo/Kernel_Tuner_demo.css new file mode 100644 index 0000000..5069d77 --- /dev/null +++ b/demo/Kernel_Tuner_demo.css @@ -0,0 +1,23 @@ +.reveal, +.reveal h1, +.reveal h2, +.reveal h3, +.reveal h4, +.reveal h5, +.reveal h6 { + font-family: "Nunito"; +} + +.p { + font-family: "Assistant"; +} + +body.notebook_app.rise-enabled { + background: url('img/slide_background.png'); + background-position: left top; + height: 100%; + width: 100%; + background-repeat: no-repeat; + background-size: auto 100%; + padding-top: calc(50px + 2vh); +} diff --git a/demo/Kernel_Tuner_demo.ipynb b/demo/Kernel_Tuner_demo.ipynb new file mode 100644 index 0000000..bbee7f7 --- /dev/null +++ b/demo/Kernel_Tuner_demo.ipynb @@ -0,0 +1,317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "09f74f14", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "
\n", + " \n", + "# Kernel Tuner demo\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + "By Ben van Werkhoven, Netherlands eScience Center
\n", + "b.vanwerkhoven@esciencecenter.nl\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "305a09d5", + "metadata": { + "slideshow": { + "slide_type": "notes" + } + }, + "source": [ + "alt+r to start the slideshow, spacebar or shift+spacebar to move forward to next slide, comma to remove on screen buttons\n", + "\n", + "preparation: run the next code cell, start a second terminal and go the the directory of this notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b5e909f", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "rm matmul_cache.json\n", + "rm vector_add.cu" + ] + }, + { + "cell_type": "markdown", + "id": "0fbf8c5a", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Optimizing GPU Applications\n", + "\n", + "To maximize GPU code performance, you need to find the best combination of:\n", + "\n", + "* Different mappings of the problem to threads and thread blocks\n", + "* Different data layouts in different memories (shared, constant, …)\n", + "* Different ways of exploiting special hardware features\n", + "* Thread block dimensions\n", + "* Code optimizations that may be applied or not\n", + "* Work per thread in each dimension\n", + "* Loop unrolling factors\n", + "* Overlapping computation and communication\n", + "* ...\n", + "\n", + "Problem:\n", + "* Creates a very large design space!" + ] + }, + { + "cell_type": "markdown", + "id": "59e489ff", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Kernel Tuner\n", + "\n", + "*A Python tool for optimizing and tuning GPU applications*\n", + "\n", + "Started in 2016:\n", + "* As a software development tool for GPU projects at the eScience center\n", + "* To be used directly on existing kernels\n", + "* Without inserting dependences in the kernel code\n", + "* Kernels can still be compiled with regular compilers\n", + "\n", + "Today:\n", + "* Comprehensive toolbox for auto-tuning with several tools being built on top\n", + "* Developed by a team of 7 developers across CWI, Astron, and eScience center\n", + "* Used in over 10 different eScience center projects and by others\n", + "\n", + "https://github.com/KernelTuner/kernel_tuner" + ] + }, + { + "cell_type": "markdown", + "id": "4e2675bf", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Minimal Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "169859e8", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile vector_add.cu\n", + "__global__ void vector_add(float *c, float *a, float *b, int n) {\n", + " int i = blockIdx.x * block_size_x + threadIdx.x;\n", + " if (i\n", + "\n", + "
\n", + "\n", + "#### Live visualizations of auto-tuning sessions using Kernel Tuner\n", + "\n", + "\n", + " \n", + "https://github.com/KernelTuner/dashboard\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "57b5db81", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Tuning a larger problem" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed6931e", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "problem_size = (512, 512)\n", + "A = np.random.randn(*problem_size).astype(np.float32)\n", + "B = np.random.randn(*problem_size).astype(np.float32)\n", + "C = np.zeros_like(A)\n", + "\n", + "args = [C, A, B]\n", + "\n", + "tune_params = OrderedDict()\n", + "tune_params[\"block_size_x\"] = [2**i for i in range(0, 11)]\n", + "tune_params[\"block_size_y\"] = [2**i for i in range(0, 11)]\n", + "tune_params[\"tile_size_x\"] = [2**i for i in range(0, 6)]\n", + "tune_params[\"tile_size_y\"] = [2**i for i in range(0, 6)]\n", + "\n", + "restrict = [\"block_size_x == block_size_y * tile_size_y\"]\n", + "grid_div_x = [\"block_size_x\", \"tile_size_x\"]\n", + "grid_div_y = [\"block_size_y\", \"tile_size_y\"]\n", + "\n", + "from kernel_tuner.nvml import NVMLObserver\n", + "nvml_observer = NVMLObserver([\"nvml_energy\", \"temperature\", \"core_freq\"])\n", + "\n", + "metrics = OrderedDict()\n", + "metrics[\"GFLOP/s\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"time\"] / 1e3)\n", + "metrics[\"GFLOPs/W\"] = lambda p : (2 * 512**3 / 1e9) / (p[\"nvml_energy\"])\n", + "\n", + "_ = kt.tune_kernel(\"matmul_kernel\", \"matmul.cu\", problem_size, args, tune_params,\n", + " observers=[nvml_observer], grid_div_y=grid_div_y, grid_div_x=grid_div_x,\n", + " restrictions=restrict, metrics=metrics, cache=\"matmul_cache.json\")" + ] + }, + { + "cell_type": "markdown", + "id": "3778bbd1", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "While the previous cell is running go to second terminal and type \"ktdashboard matmul_cache.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "a7377204", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Final remarks\n", + "\n", + "Currently, using Kernel Tuner to optimize and tune code in:\n", + "* Ultrasound Brain Imaging with Erasmus MC (RECRUIT)\n", + "* Atmospheric Modeling (ESiWACE-2 Microhh)\n", + "* Radio Astronomy (CORTEX)\n", + "\n", + "Kernel Tuner can also be used for optimizing the energy efficiency of GPU applications:\n", + "> Going green: optimizing GPUs for energy efficiency through model-steered auto-tuning
\n", + "R. Schoonhoven, B. Veenboer, B. van Werkhoven, K. J. Batenburg
\n", + "International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS) at Supercomputing (SC22) 2022 \n", + "\n", + "Main repository:
\n", + "https://github.com/KernelTuner/kernel_tuner
\n", + "Documentation:
\n", + "https://KernelTuner.github.io
\n", + "Tutorial:
\n", + "https://github.com/KernelTuner/kernel_tuner_tutorial
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f89b4b0", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "rise": { + "transition": "none" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/demo/img/dashboard.png b/demo/img/dashboard.png new file mode 100644 index 0000000..00cab9e Binary files /dev/null and b/demo/img/dashboard.png differ diff --git a/demo/img/dashboard_logo.png b/demo/img/dashboard_logo.png new file mode 100644 index 0000000..a4e8fb7 Binary files /dev/null and b/demo/img/dashboard_logo.png differ diff --git a/demo/img/slide_background.png b/demo/img/slide_background.png new file mode 100644 index 0000000..53740f5 Binary files /dev/null and b/demo/img/slide_background.png differ diff --git a/demo/matmul.cu b/demo/matmul.cu new file mode 100644 index 0000000..d208308 --- /dev/null +++ b/demo/matmul.cu @@ -0,0 +1,91 @@ +/** + * The kernel is assumed to be tuned to each device by selecting + * the best performing combination of thread block dimensions + * and tiling factors in X and Y. In this implementation tiling + * in X increases the amount of work per thread block and tiling + * in Y increases the amount of work per thread within the block. + * + * WARNING: THIS KERNEL IS FOR EDUCATIONAL PURPOSES ONLY. + * PLEASE *DO NOT USE IT* IN PRODUCTION, USE A BLAS + * LIBRARY SUCH AS CUBLAS, CLBLAST OR CUTLASS INSTEAD. + * + * @author Ben van Werkhoven + * + */ + +#define WIDTH 512 +/* + * Optimized CUDA kernel for matrix multiplication + * + * This kernel is optimized according to the directions given + * in: "Better performance at lower occupancy" by V. Volkov, + * GPU Technology Conference, GTC 2010. + * + * The thread block dimensions (block_size_x, block_size_y) + * and tiling factors (tile_size_x, tile_size_y) are to be + * tuned towards each GPU. This kernel assumes that + * block_size_x = block_size_y * tile_size_y. + * + * The kernel computes C=A*B, where A, B, and C are square + * matrices with height and width equal to WIDTH + */ +__global__ void matmul_kernel(float *C, float *A, float *B) { + + __shared__ float sA[block_size_y*tile_size_y][block_size_x]; + __shared__ float sB[block_size_y*tile_size_y][block_size_x * tile_size_x]; + + int tx = threadIdx.x; + int ty = threadIdx.y; + int x = blockIdx.x * block_size_x * tile_size_x + threadIdx.x; + int y = blockIdx.y * block_size_y * tile_size_y + threadIdx.y; + int k, kb; + + float sum[tile_size_y][tile_size_x]; + #pragma unroll + for (int i = 0; i < tile_size_y; i++) { + #pragma unroll + for (int j = 0; j < tile_size_x; j++) { + sum[i][j] = 0.0f; + } + } + + for (k = 0; k < WIDTH; k += block_size_x) { + + __syncthreads(); + #pragma unroll + for (int i = 0; i < tile_size_y; i++) { + sA[ty + block_size_y * i][tx] = A[(y+i*block_size_y) * WIDTH + k + tx]; + + #pragma unroll + for (int j = 0; j < tile_size_x; j++) { + sB[ty + block_size_y * i][tx + j * block_size_x] = B[(k + ty + block_size_y * i) * WIDTH + x + j * block_size_x]; + } + } + __syncthreads(); + + //compute + #pragma unroll + for (kb = 0; kb < block_size_x; kb++) { + + #pragma unroll + for (int i = 0; i < tile_size_y; i++) { + #pragma unroll + for (int j = 0; j < tile_size_x; j++) { + sum[i][j] += sA[ty + block_size_y * i][kb] * sB[kb][tx + j * block_size_x]; + } + } + + } + + } + + //store result + #pragma unroll + for (int i = 0; i < tile_size_y; i++) { + #pragma unroll + for (int j = 0; j < tile_size_x; j++) { + C[y * WIDTH + x + block_size_y * i * WIDTH + j * block_size_x] = sum[i][j]; + } + } + +} diff --git a/demo/requirements.txt b/demo/requirements.txt new file mode 100644 index 0000000..eb15857 --- /dev/null +++ b/demo/requirements.txt @@ -0,0 +1,2 @@ +notebook +RISE