From b6b4fb04b7102cdd58aa0811ae6afc638914e15a Mon Sep 17 00:00:00 2001 From: bjodom Date: Wed, 26 Apr 2023 14:46:59 -0700 Subject: [PATCH] Add files via upload --- .../gemm_oneMKL_SYCL/00_GEMM/00_GEMM.ipynb | 35 ++ .../00_GEMM/01_GEMM_DPCPP_Buffers.ipynb | 311 ++++++++++++++++++ .../00_GEMM/02_GEMM_DPCPP_USM.ipynb | 255 ++++++++++++++ .../gemm_oneMKL_SYCL/00_GEMM/GEMM_OMP.ipynb | 223 +++++++++++++ .../00_GEMM/lab/dpcpp_gemm_buffers.cpp | 107 ++++++ .../00_GEMM/lab/dpcpp_gemm_usm.cpp | 124 +++++++ .../gemm_oneMKL_SYCL/00_GEMM/lab/omp_gemm.cpp | 95 ++++++ Libraries/gemm_oneMKL_SYCL/00_GEMM/q | 52 +++ .../00_GEMM/run_gemm_buffers.sh | 7 + .../gemm_oneMKL_SYCL/00_GEMM/run_gemm_omp.sh | 5 + .../gemm_oneMKL_SYCL/00_GEMM/run_gemm_usm.sh | 5 + .../00_GEMM/src/dpcpp_gemm_buffers.cpp | 99 ++++++ .../00_GEMM/src/dpcpp_gemm_usm.cpp | 123 +++++++ .../gemm_oneMKL_SYCL/00_GEMM/src/omp_gemm.cpp | 95 ++++++ Libraries/gemm_oneMKL_SYCL/oneMKL_Intro.ipynb | 149 +++++++++ 15 files changed, 1685 insertions(+) create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/00_GEMM.ipynb create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/01_GEMM_DPCPP_Buffers.ipynb create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/02_GEMM_DPCPP_USM.ipynb create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/GEMM_OMP.ipynb create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_buffers.cpp create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_usm.cpp create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/omp_gemm.cpp create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/q create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_buffers.sh create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_omp.sh create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_usm.sh create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_buffers.cpp create mode 100644 Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_usm.cpp create mode 100644 
Libraries/gemm_oneMKL_SYCL/00_GEMM/src/omp_gemm.cpp create mode 100644 Libraries/gemm_oneMKL_SYCL/oneMKL_Intro.ipynb diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/00_GEMM.ipynb b/Libraries/gemm_oneMKL_SYCL/00_GEMM/00_GEMM.ipynb new file mode 100644 index 0000000000..84accfef2a --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/00_GEMM.ipynb @@ -0,0 +1,35 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Matrix Multiplication (GEMM)\n", + "### [DPC++ (Buffers)](./01_GEMM_DPCPP_Buffers.ipynb)\n", + "### [DPC++ (Unified Shared Memory)](./02_GEMM_DPCPP_USM.ipynb)\n", + "### [OpenMP Offload](./GEMM_OMP.ipynb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (Intel® oneAPI 2023.0)", + "language": "python", + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/01_GEMM_DPCPP_Buffers.ipynb b/Libraries/gemm_oneMKL_SYCL/00_GEMM/01_GEMM_DPCPP_Buffers.ipynb new file mode 100644 index 0000000000..3c0891e3f2 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/01_GEMM_DPCPP_Buffers.ipynb @@ -0,0 +1,311 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GEMM (Using DPC++ Buffers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following example shows a simple matrix multiplication program using __DPC++__ with the __buffer/accessor__ style of memory management. Follow along with the instructions of the lab to build and run the program. The lab requires a mixture of observing key components, and making simple modifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting lab/dpcpp_gemm_buffers.cpp\n" + ] + } + ], + "source": [ + "%%writefile lab/dpcpp_gemm_buffers.cpp\n", + "//==============================================================\n", + "// Copyright © 2023 Intel Corporation\n", + "//\n", + "// SPDX-License-Identifier: MIT\n", + "// =============================================================\n", + "\n", + "#include \n", + "#include \n", + "\n", + "//# sycl namespace\n", + "#include \n", + "using namespace sycl;\n", + "\n", + "//# oneMKL DPC++ interface for BLAS functions\n", + "\n", + "#include \"oneapi/mkl/blas.hpp\" \n", + "// # shorten mkl namespace\n", + "namespace mkl = oneapi::mkl; \n", + "\n", + "//# The following project performs matrix multiplication using oneMKL / DPC++ with buffers.\n", + "//# We will execute the simple operation A * B = C\n", + "//# The matrix B is set equal to the identity matrix such that A * B = A * I\n", + "//# After performing the computation, we will verify A * I = C -> A = C\n", + "\n", + "\n", + "\n", + "int main() {\n", + "\n", + " //# dimensions\n", + " int m = 3, n = 3, k = 3;\n", + " //# leading dimensions\n", + " int ldA = 3, ldB = 3, ldC = 3;\n", + " //# scalar multipliers\n", + " double alpha = 1.0, beta = 1.0;\n", + " //# transpose status of matrices\n", + " mkl::transpose transA = mkl::transpose::nontrans;\n", + " mkl::transpose transB = mkl::transpose::nontrans;\n", + " //# matrix data\n", + " std::vector A = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};\n", + " std::vector B = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0};\n", + " std::vector C = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};\n", + " \n", + " //### Step 1 - Observe the definition of an asynchronous exception handler.\n", + " //# This function object will later be supplied to the queue.\n", + " //# It is designed to handle errors thrown 
while device code executes.\n", + " auto async_handler = [](sycl::exception_list exceptions) {\n", + " for (std::exception_ptr const &e : exceptions) {\n", + " try {\n", + " std::rethrow_exception(e);\n", + " }\n", + " catch (sycl::exception const &e) {\n", + " std::cout << \"Caught asynchronous SYCL exception: \" << e.what() << std::endl;\n", + " }\n", + " }\n", + " };\n", + "\n", + " //### Step 2 - Create a device object. (create device and q in one step)\n", + " //# Device selectors are used to specify the type of a device.\n", + " //# Uncomment _one_ of the following three lines to select a device.\n", + " queue q(default_selector_v, async_handler); //# default_selector returns a device based on a performance heuristic\n", + " // queue q(cpu_selector_v); //# cpu_selector returns a cpu device\n", + " // queue q(gpu_selector_v); //# gpu_selector returns a gpu device\n", + " // queue q;\n", + " //# Print actual device used\n", + " std::cout << \"Device: \" << q.get_device().get_info() << \"\\n\";\n", + "\n", + " //### Step 4 - Create buffers to hold our matrix data.\n", + " //# Buffer objects can be constructed given a container\n", + " //# Observe the creation of buffers for matrices A and B.\n", + " //# Try and create a third buffer for matrix C called C_buffer.\n", + " //# The solution is shown in the hidden cell below.\n", + " buffer A_buffer(A);\n", + " buffer B_buffer(B);\n", + " /* define C_buffer below */\n", + " buffer C_buffer(C);\n", + " \n", + "\n", + " //### Step 5 - Execute gemm operation.\n", + " //# Here, we need only pass in our queue and other familiar matrix multiplication parameters.\n", + " //# This includes the dimensions and data buffers for matrices A, B, and C.\n", + " mkl::blas::gemm(q, transA, transB, m, n, k, alpha, A_buffer, ldA, B_buffer, ldB, beta, C_buffer, ldC);\n", + "\n", + " //# we cannot explicitly transfer memory to/from the device when using buffers\n", + " //# that is why we must use this operation to ensure result data is 
returned to the host\n", + " q.wait_and_throw(); //# block until operation completes, throw any errors\n", + "\n", + " //### Step 6 - Observe creation of accessors to retrieve data from A_buffer and C_buffer.\n", + " accessor A_acc(A_buffer,read_only);\n", + " accessor C_acc(C_buffer,read_only);\n", + "\n", + " int status = 0;\n", + "\n", + " // verify C matrix using accessor to observe values held in C_buffer\n", + " std::cout << \"\\n\";\n", + " std::cout << \"C = \\n\";\n", + " for (int i = 0; i < m; ++i) {\n", + " for (int j = 0; j < n; ++j) {\n", + " if (A_acc[i*m+j] != C_acc[i*m+j]) status = 1;\n", + " std::cout << C_acc[i*m+j] << \" \";\n", + " }\n", + " std::cout << \"\\n\";\n", + " }\n", + " std::cout << \"\\n\";\n", + "\n", + " status == 0 ? std::cout << \"Verified: A = C\\n\" : std::cout << \"Failed: A != C\\n\";\n", + " return status;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Solutions - click the three dots below to reveal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 4 - The correct line is\n", + "```sycl::buffer C_buffer(C);```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build and Run\n", + "Select the cell below and click Run ▶ to compile and execute the code above:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job has been submitted to Intel(R) DevCloud and will execute soon.\n", + "\n", + " If you do not see result in 60 seconds, please restart the Jupyter kernel:\n", + " Kernel -> 'Restart Kernel and Clear All Outputs...' 
and then try again\n", + "\n", + "Job ID Name User Time Use S Queue\n", + "------------------------- ---------------- --------------- -------- - -----\n", + "2281801.v-qsvr-1 ...ub-singleuser u51369 00:01:08 R jupyterhub \n", + "2281830.v-qsvr-1 STDIN u51369 00:00:06 R batch \n", + "2281854.v-qsvr-1 ...mm_buffers.sh u51369 0 Q batch \n", + "\n", + "Waiting for Output ██████████ Done⬇\n", + "\n", + "########################################################################\n", + "# Date: Thu 20 Apr 2023 03:27:43 PM PDT\n", + "# Job ID: 2281854.v-qsvr-1.aidevcloud\n", + "# User: u51369\n", + "# Resources: cput=75:00:00,neednodes=1:gpu:ppn=2,nodes=1:gpu:ppn=2,walltime=06:00:00\n", + "########################################################################\n", + "\n", + "## u51369 is compiling oneMKL_introduction Module0 -- gemm with buffers - 1 of 3 dpcpp_gemm_buffers.cpp\n", + "\n", + "########################################################################\n", + "# End of output for job 2281854.v-qsvr-1.aidevcloud\n", + "# Date: Thu 20 Apr 2023 03:27:48 PM PDT\n", + "########################################################################\n", + "\n", + "lab/dpcpp_gemm_buffers.cpp:7:10: fatal error: 'sycl/sycl.hpp' file not found\n", + "#include //# sycl namespace\n", + " ^~~~~~~~~~~~~~~\n", + "1 error generated.\n", + "Job Completed in 10 seconds.\n" + ] + } + ], + "source": [ + "! chmod 755 q; chmod 755 run_gemm_buffers.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run_gemm_buffers.sh; else ./run_gemm_buffers.sh; fi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The build instructions for this sample can be found in the ```run_gemm_buffers.sh``` script. Consider using the [Link Line Advisor](https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html) to help you create compile and link lines for your oneMKL projects." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Survey

\n", + "\n", + "[We would appreciate any feedback you’d care to give, so that we can improve the overall training quality and experience. Thanks! ](https://intel.az1.qualtrics.com/jfe/form/SV_cCpY08ARDi6NhfT)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Reset Notebook

\n", + "\n", + "##### Should you be experiencing any issues with your notebook or just want to start fresh run the below cell." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e9f5a54d77f541bf981b6aebea899d57", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(Button(description='Reset Notebook', icon='check', style=ButtonStyle(), tooltip='This will upda…" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import display, Markdown, clear_output\n", + "import ipywidgets as widgets\n", + "button = widgets.Button(\n", + " description='Reset Notebook',\n", + " disabled=False,\n", + " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " tooltip='This will update this notebook, overwriting any changes.',\n", + " icon='check' # (FontAwesome names without the `fa-` prefix)\n", + ")\n", + "out = widgets.Output()\n", + "def on_button_clicked(_):\n", + " # \"linking function with output\"\n", + " with out:\n", + " # what happens when we press the button\n", + " clear_output()\n", + " !rsync -a --size-only /data/oneapi_workshop/Intel_oneAPI_MKL_Training/00_GEMM/ ~/Intel_oneAPI_MKL_Training/00_GEMM/\n", + " print('Notebook reset -- now click reload on browser.')\n", + "# linking button and function together using a button's method\n", + "button.on_click(on_button_clicked)\n", + "# displaying button and its output together\n", + "widgets.VBox([button,out])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (Intel® oneAPI 2023.0)", + "language": "python", + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/02_GEMM_DPCPP_USM.ipynb b/Libraries/gemm_oneMKL_SYCL/00_GEMM/02_GEMM_DPCPP_USM.ipynb new file mode 100644 index 0000000000..d00466d7c5 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/02_GEMM_DPCPP_USM.ipynb @@ -0,0 +1,255 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GEMM (Using DPC++ Unified Shared Memory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following example shows a simple matrix multiplication program using __DPC++__ with the __unified shared memory__ style of memory management. Follow along with the instructions of the lab to build and run the program. The lab requires a mixture of observing key components, and making simple modifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting lab/dpcpp_gemm_usm.cpp\n" + ] + } + ], + "source": [ + "%%writefile lab/dpcpp_gemm_usm.cpp\n", + "//==============================================================\n", + "// Copyright © 2020 Intel Corporation\n", + "//\n", + "// SPDX-License-Identifier: MIT\n", + "// =============================================================\n", + "#include \n", + "#include \n", + "#include //# sycl namespace\n", + "#include \"oneapi/mkl/blas.hpp\" //# oneMKL DPC++ interface for BLAS functions\n", + "\n", + "//# The following project performs matrix multiplication using oneMKL / DPC++ with Unified Shared Memory (USM)\n", + "//# We will execute the simple operation A * B = C\n", + "//# The matrix B is set equal to the identity matrix such that A * B = A * I\n", + "//# After performing the computation, we will verify A * I = C -> A = C\n", + "\n", + "using namespace sycl;\n", + "namespace mkl = oneapi::mkl; //# shorten mkl namespace\n", + "\n", + "int main() {\n", + "\n", + " //# dimensions\n", + " int m = 3, n = 3, k = 3;\n", + " //# leading dimensions\n", + " int ldA = 3, ldB = 3, ldC = 3;\n", + " //# scalar multipliers\n", + " double alpha = 1.0, beta = 1.0;\n", + " //# transpose status of matrices\n", + " mkl::transpose transA = mkl::transpose::nontrans;\n", + " mkl::transpose transB = mkl::transpose::nontrans;\n", + "\n", + " //### Step 1 - Observe the definition of an asynchronous exception handler.\n", + " //# This function object will later be supplied to the queue.\n", + " //# It is designed to handle errors thrown while device code executes.\n", + " auto async_handler = [](sycl::exception_list exceptions) {\n", + " for (std::exception_ptr const &e : exceptions) {\n", + " try {\n", + " std::rethrow_exception(e);\n", + " }\n", + " catch (sycl::exception const &e) {\n", + " std::cout << \"Caught 
asynchronous SYCL exception: \" << e.what() << std::endl;\n", + " }\n", + " }\n", + " };\n", + "\n", + " //### Step 2 - Create a device object.\n", + " //# Device selectors are used to specify the type of a device.\n", + " //# Uncomment _one_ of the following three lines to select a device.\n", + " // sycl::device device = sycl::device(sycl::default_selector()); //# default_selector returns a device based on a performance heuristic\n", + " // sycl::device device = sycl::device(sycl::cpu_selector()); //# cpu_selector returns a cpu device\n", + " // sycl::device device = sycl::device(sycl::gpu_selector()); //# gpu_selector returns a gpu device\n", + " std::cout << \"Device: \" << device.get_info() << \"\\n\";\n", + "\n", + " //### Step 3 - Create a queue object.\n", + " //# A queue accepts a single device, and optionally, an exception handler.\n", + " //# Uncomment the following line to initialize a queue with our device and handler.\n", + " // sycl::queue queue(device, async_handler);\n", + "\n", + " //### Step 4 - Create a sycl event and allocate USM\n", + " //# The later execution of the gemm operation is tied to this event\n", + " //# The gemm operation will also make use of a vector of sycl events we can call 'gemm_dependencies'\n", + " sycl::event gemm_done;\n", + " std::vector gemm_dependencies;\n", + " //# Here, we allocate USM pointers for each matrix, using the special 'malloc_shared' function\n", + " //# Make sure to template the function with the correct precision, and pass in our queue to the function call\n", + " double *A_usm = sycl::malloc_shared(m * k, queue);\n", + " double *B_usm = sycl::malloc_shared(k * n, queue);\n", + " double *C_usm = sycl::malloc_shared(m * n, queue);\n", + "\n", + " //# define matrix A as the 3x3 matrix\n", + " //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}}\n", + " for (int i = 0; i < m; i++) {\n", + " for (int j = 0; j < k; j++) {\n", + " A_usm[i*m+j] = (double)(i*m+j) + 1.0;\n", + " }\n", + " }\n", + " \n", + " //# define matrix B 
as the identity matrix\n", + " for (int i = 0; i < k; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " if (i == j) B_usm[i*k+j] = 1.0;\n", + " else B_usm[i*k+j] = 0.0;\n", + " }\n", + " }\n", + " \n", + " //# initialize C as a 0 matrix\n", + " for (int i = 0; i < m; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " C_usm[i*m+j] = 0.0;\n", + " }\n", + " }\n", + "\n", + " //### Step 5 - Execute gemm operation.\n", + " //# Here, we fill in the familiar parameters for the gemm operation.\n", + " //# However, we must also pass in the queue as the first parameter.\n", + " //# We must also pass in our list of dependencies as the final parameter.\n", + " //# We are also passing in our USM pointers as opposed to a buffer or raw data pointer.\n", + " gemm_done = mkl::blas::gemm(queue, transA, transB, m, n, k, alpha, A_usm, ldA, B_usm, ldB, beta, C_usm, ldC, gemm_dependencies);\n", + "\n", + " //# We must now wait for the given event to finish before accessing any data involved in the operation\n", + " //# Otherwise, we may access data before the operation has completed, or before it has been returned to the host\n", + " gemm_done.wait();\n", + "\n", + " int status = 0;\n", + "\n", + " //# verify C matrix using USM data\n", + " std::cout << \"\\n\";\n", + " std::cout << \"C = \\n\";\n", + " for (int i = 0; i < m; ++i) {\n", + " for (int j = 0; j < n; ++j) {\n", + " if (A_usm[i*m+j] != C_usm[i*m+j]) status = 1;\n", + " std::cout << C_usm[i*m+j] << \" \";\n", + " }\n", + " std::cout << \"\\n\";\n", + " }\n", + " std::cout << \"\\n\";\n", + "\n", + " //# free usm pointers\n", + " sycl::free(A_usm, queue);\n", + " sycl::free(B_usm, queue);\n", + " sycl::free(C_usm, queue);\n", + "\n", + " status == 0 ? 
std::cout << \"Verified: A = C\\n\" : std::cout << \"Failed: A != C\\n\";\n", + " return status;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build and Run\n", + "Select the cell below and click Run ▶ to compile and execute the code above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! chmod 755 q; chmod 755 run_gemm_usm.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run_gemm_usm.sh; else ./run_gemm_usm.sh; fi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The build instructions for this sample can be found in the ```run_gemm_usm.sh``` script. Consider using the [Link Line Advisor](https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html) to help you create compile and link lines for your oneMKL projects." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Survey

\n", + "\n", + "[We would appreciate any feedback you’d care to give, so that we can improve the overall training quality and experience. Thanks! ](https://intel.az1.qualtrics.com/jfe/form/SV_3JyBEAtwk5YIg85)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Reset Notebook

\n", + "\n", + "##### Should you be experiencing any issues with your notebook or just want to start fresh run the below cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown, clear_output\n", + "import ipywidgets as widgets\n", + "button = widgets.Button(\n", + " description='Reset Notebook',\n", + " disabled=False,\n", + " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " tooltip='This will update this notebook, overwriting any changes.',\n", + " icon='check' # (FontAwesome names without the `fa-` prefix)\n", + ")\n", + "out = widgets.Output()\n", + "def on_button_clicked(_):\n", + " # \"linking function with output\"\n", + " with out:\n", + " # what happens when we press the button\n", + " clear_output()\n", + " !rsync -a --size-only /data/oneapi_workshop/Intel_oneAPI_MKL_Training/00_GEMM/ ~/Intel_oneAPI_MKL_Training/00_GEMM/\n", + " print('Notebook reset -- now click reload on browser.')\n", + "# linking button and function together using a button's method\n", + "button.on_click(on_button_clicked)\n", + "# displaying button and its output together\n", + "widgets.VBox([button,out])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (Intel® oneAPI 2023.0)", + "language": "python", + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/GEMM_OMP.ipynb b/Libraries/gemm_oneMKL_SYCL/00_GEMM/GEMM_OMP.ipynb new file mode 100644 index 0000000000..98f02e76a6 --- /dev/null +++ 
b/Libraries/gemm_oneMKL_SYCL/00_GEMM/GEMM_OMP.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# GEMM (Using OpenMP Offload)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following example shows a simple matrix multiplication program using __OpenMP Offload__. Follow along with the instructions of the lab to build and run the program. The lab requires a mixture of observing key components, and making simple modifications." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile lab/omp_gemm.cpp\n", + "//==============================================================\n", + "// Copyright © 2020 Intel Corporation\n", + "//\n", + "// SPDX-License-Identifier: MIT\n", + "// =============================================================\n", + "#include \n", + "#include \"mkl.h\" //# main mkl header\n", + "#include \"mkl_omp_offload.h\" //# mkl OMP Offload interface\n", + "\n", + "int dnum = 0;\n", + "\n", + "int main() {\n", + "\n", + " //# dimensions\n", + " MKL_INT m = 3, n = 3, k = 3;\n", + " //# leading dimensions\n", + " MKL_INT ldA = k, ldB = n, ldC = n;\n", + " //# scalar multipliers\n", + " double alpha = 1.0;\n", + " double beta = 1.0;\n", + " //# matrix data\n", + " double *A = (double *)malloc(m * k * sizeof(double));\n", + " double *B = (double *)malloc(k * n * sizeof(double));\n", + " double *C = (double *)malloc(m * n * sizeof(double));\n", + "\n", + " //# define matrix A as the 3x3 matrix\n", + " //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}}\n", + " for (int i = 0; i < m; i++) {\n", + " for (int j = 0; j < k; j++) {\n", + " A[i*m+j] = (double)(i*m+j) + 1.0;\n", + " }\n", + " }\n", + "\n", + " //# define matrix B as the identity matrix\n", + " for (int i = 0; i < k; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " if (i == j) B[i*k+j] = 1.0;\n", + " else B[i*k+j] = 0.0;\n", + " }\n", + " }\n", + "\n", + " 
//# initialize C as a 0 matrix\n", + " for (int i = 0; i < m; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " C[i*m+j] = 0.0;\n", + " }\n", + " }\n", + "\n", + " MKL_INT sizeA = m*k;\n", + " MKL_INT sizeB = k*n;\n", + " MKL_INT sizeC = m*n;\n", + " \n", + " //# Below are the two compiler directives necessary to offload the GEMM operation\n", + " //# we are using 'dgemm' to specify we are using double-precision values\n", + " \n", + " //# The outer directive maps input data (matrices A & B) 'to' the device.\n", + " //# It also maps output data (matrix C) 'from' the device so that the results of the operation are returned.\n", + " //# Finally, this directive specifies device number 0, which should interact with an available GPU.\n", + " \n", + " //# The inner directive dispatches the correct version of the contained operation, again specifying the device number.\n", + " //# This directive also uses the 'use_devce_ptr' statement to specify the data we are working with (in this case, arrays A, B, & C).\n", + " \n", + " //# Uncomment the two 'pragma' lines below. (Do not remove the '#' character)\n", + " \n", + " //#pragma omp target data map(to:A[0:sizeA],B[0:sizeB]) map(from:C[0:sizeC]) device(dnum)\n", + " {\n", + " //#pragma omp target variant dispatch device(dnum) use_device_ptr(A, B, C)\n", + " {\n", + " dgemm(\"N\", \"N\", &m, &n, &k, &alpha, A, &ldA, B, &ldB, &beta, C, &ldC);\n", + " }\n", + " }\n", + "\n", + " int status = 0;\n", + "\n", + " //# verify C matrix\n", + " printf(\"\\n\");\n", + " printf(\"C = \\n\");\n", + " for (int i = 0; i < m; i++) {\n", + " for (int j = 0; j < n; j++) {\n", + " if (A[i*m+j] != C[i*m+j]) status = 1;\n", + " printf(\"%0.0f \", C[i*m+j]);\n", + " }\n", + " printf(\"\\n\");\n", + " }\n", + " printf(\"\\n\");\n", + "\n", + " //# free matrix data\n", + " free(A);\n", + " free(B);\n", + " free(C);\n", + "\n", + " status == 0 ? 
printf(\"Verified: A = C\\n\") : printf(\"Failed: A != C\\n\");\n", + "\n", + " return status;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Build and Run\n", + "Select the cell below and click Run ▶ to compile and execute the code above:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! chmod 755 q; chmod 755 run_gemm_omp.sh;if [ -x \"$(command -v qsub)\" ]; then ./q run_gemm_omp.sh; else ./run_gemm_omp.sh; fi" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The build instructions for this sample can be found in the ```run_gemm_omp.sh``` script. Consider using the [Link Line Advisor](https://software.intel.com/content/www/us/en/develop/articles/intel-mkl-link-line-advisor.html) to help you create compile and link lines for your oneMKL projects.\n", + "\n", + "To validate your runs of OpenMP, consider setting the following environment variables\n", + "* ```LIBOMPTARGET_PROFILE=T``` - Enables OpenMP profiling, also prints the chosen device\n", + "* ```LIBOMPTARGET_DEBUG=1``` - Forces OpenMP to dump debug info\n", + "* ```LIBOMPTARGET_PLUGIN=OpenCL``` - Target OpenCL backend instead of Level0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Survey

\n", + "\n", + "[We would appreciate any feedback you’d care to give, so that we can improve the overall training quality and experience. Thanks! ](https://intel.az1.qualtrics.com/jfe/form/SV_55Ollfy7wQ1kuNL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Reset Notebook

\n", + "\n", + "##### Should you be experiencing any issues with your notebook or just want to start fresh run the below cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown, clear_output\n", + "import ipywidgets as widgets\n", + "button = widgets.Button(\n", + " description='Reset Notebook',\n", + " disabled=False,\n", + " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", + " tooltip='This will update this notebook, overwriting any changes.',\n", + " icon='check' # (FontAwesome names without the `fa-` prefix)\n", + ")\n", + "out = widgets.Output()\n", + "def on_button_clicked(_):\n", + " # \"linking function with output\"\n", + " with out:\n", + " # what happens when we press the button\n", + " clear_output()\n", + " !rsync -a --size-only /data/oneapi_workshop/Intel_oneAPI_MKL_Training/00_GEMM/ ~/Intel_oneAPI_MKL_Training/00_GEMM/\n", + " print('Notebook reset -- now click reload on browser.')\n", + "# linking button and function together using a button's method\n", + "button.on_click(on_button_clicked)\n", + "# displaying button and its output together\n", + "widgets.VBox([button,out])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7 (Intel® oneAPI)", + "language": "python", + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_buffers.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_buffers.cpp new file mode 100644 index 0000000000..ada8faa8e9 --- /dev/null +++ 
b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_buffers.cpp @@ -0,0 +1,107 @@ +//============================================================== +// Copyright © 2023 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include + +//# sycl namespace +#include +using namespace sycl; + +//# oneMKL DPC++ interface for BLAS functions + +#include "oneapi/mkl/blas.hpp" +// # shorten mkl namespace +namespace mkl = oneapi::mkl; + +//# The following project performs matrix multiplication using oneMKL / DPC++ with buffers. +//# We will execute the simple operation A * B = C +//# The matrix B is set equal to the identity matrix such that A * B = A * I +//# After performing the computation, we will verify A * I = C -> A = C + + + +int main() { + + //# dimensions + int m = 3, n = 3, k = 3; + //# leading dimensions + int ldA = 3, ldB = 3, ldC = 3; + //# scalar multipliers + double alpha = 1.0, beta = 1.0; + //# transpose status of matrices + mkl::transpose transA = mkl::transpose::nontrans; + mkl::transpose transB = mkl::transpose::nontrans; + //# matrix data + std::vector A = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + std::vector B = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0}; + std::vector C = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + //### Step 1 - Observe the definition of an asynchronous exception handler. + //# This function object will later be supplied to the queue. + //# It is designed to handle errors thrown while device code executes. + auto async_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl; + } + } + }; + + //### Step 2 - Create a device object. (create device and q in one step) + //# Device selectors are used to specify the type of a device. 
+ //# Uncomment _one_ of the following three lines to select a device. + queue q(default_selector_v, async_handler); //# default_selector returns a device based on a performance heuristic + // queue q(cpu_selector_v); //# cpu_selector returns a cpu device + // queue q(gpu_selector_v); //# gpu_selector returns a gpu device + // queue q; + //# Print actual device used + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + //### Step 4 - Create buffers to hold our matrix data. + //# Buffer objects can be constructed given a container + //# Observe the creation of buffers for matrices A and B. + //# Try and create a third buffer for matrix C called C_buffer. + //# The solution is shown in the hidden cell below. + buffer A_buffer(A); + buffer B_buffer(B); + /* define C_buffer below */ + buffer C_buffer(C); + + + //### Step 5 - Execute gemm operation. + //# Here, we need only pass in our queue and other familiar matrix multiplication parameters. + //# This includes the dimensions and data buffers for matrices A, B, and C. + mkl::blas::gemm(q, transA, transB, m, n, k, alpha, A_buffer, ldA, B_buffer, ldB, beta, C_buffer, ldC); + + //# we cannot explicitly transfer memory to/from the device when using buffers + //# that is why we must use this operation to ensure result data is returned to the host + q.wait_and_throw(); //# block until operation completes, throw any errors + + //### Step 6 - Observe creation of accessors to retrieve data from A_buffer and C_buffer. + accessor A_acc(A_buffer,read_only); + accessor C_acc(C_buffer,read_only); + + int status = 0; + + // verify C matrix using accessor to observe values held in C_buffer + std::cout << "\n"; + std::cout << "C = \n"; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + if (A_acc[i*m+j] != C_acc[i*m+j]) status = 1; + std::cout << C_acc[i*m+j] << " "; + } + std::cout << "\n"; + } + std::cout << "\n"; + + status == 0 ? 
std::cout << "Verified: A = C\n" : std::cout << "Failed: A != C\n"; + return status; +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_usm.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_usm.cpp new file mode 100644 index 0000000000..3d44473fb7 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/dpcpp_gemm_usm.cpp @@ -0,0 +1,124 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include //# sycl namespace +#include "oneapi/mkl/blas.hpp" //# oneMKL DPC++ interface for BLAS functions + +//# The following project performs matrix multiplication using oneMKL / DPC++ with Unified Shared Memory (USM) +//# We will execute the simple operation A * B = C +//# The matrix B is set equal to the identity matrix such that A * B = A * I +//# After performing the computation, we will verify A * I = C -> A = C + +using namespace sycl; +namespace mkl = oneapi::mkl; //# shorten mkl namespace + +int main() { + + //# dimensions + int m = 3, n = 3, k = 3; + //# leading dimensions + int ldA = 3, ldB = 3, ldC = 3; + //# scalar multipliers + double alpha = 1.0, beta = 1.0; + //# transpose status of matrices + mkl::transpose transA = mkl::transpose::nontrans; + mkl::transpose transB = mkl::transpose::nontrans; + + //### Step 1 - Observe the definition of an asynchronous exception handler. + //# This function object will later be supplied to the queue. + //# It is designed to handle errors thrown while device code executes. + auto async_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl; + } + } + }; + + //### Step 2 - Create a device object. 
+ //# Device selectors are used to specify the type of a device. + //# Uncomment _one_ of the following three lines to select a device. + // sycl::device device = sycl::device(sycl::default_selector()); //# default_selector returns a device based on a performance heuristic + // sycl::device device = sycl::device(sycl::cpu_selector()); //# cpu_selector returns a cpu device + // sycl::device device = sycl::device(sycl::gpu_selector()); //# gpu_selector returns a gpu device + std::cout << "Device: " << device.get_info() << "\n"; + + //### Step 3 - Create a queue object. + //# A queue accepts a single device, and optionally, an exception handler. + //# Uncomment the following line to initialize a queue with our device and handler. + // sycl::queue queue(device, async_handler); + + //### Step 4 - Create a sycl event and allocate USM + //# The later execution of the gemm operation is tied to this event + //# The gemm operation will also make use of a vector of sycl events we can call 'gemm_dependencies' + sycl::event gemm_done; + std::vector gemm_dependencies; + //# Here, we allocate USM pointers for each matrix, using the special 'malloc_shared' function + //# Make sure to template the function with the correct precision, and pass in our queue to the function call + double *A_usm = sycl::malloc_shared(m * k, queue); + double *B_usm = sycl::malloc_shared(k * n, queue); + double *C_usm = sycl::malloc_shared(m * n, queue); + + //# define matrix A as the 3x3 matrix + //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}} + for (int i = 0; i < m; i++) { + for (int j = 0; j < k; j++) { + A_usm[i*m+j] = (double)(i*m+j) + 1.0; + } + } + + //# define matrix B as the identity matrix + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + if (i == j) B_usm[i*k+j] = 1.0; + else B_usm[i*k+j] = 0.0; + } + } + + //# initialize C as a 0 matrix + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + C_usm[i*m+j] = 0.0; + } + } + + //### Step 5 - Execute gemm operation. 
+ //# Here, we fill in the familiar parameters for the gemm operation. + //# However, we must also pass in the queue as the first parameter. + //# We must also pass in our list of dependencies as the final parameter. + //# We are also passing in our USM pointers as opposed to a buffer or raw data pointer. + gemm_done = mkl::blas::gemm(queue, transA, transB, m, n, k, alpha, A_usm, ldA, B_usm, ldB, beta, C_usm, ldC, gemm_dependencies); + + //# We must now wait for the given event to finish before accessing any data involved in the operation + //# Otherwise, we may access data before the operation has completed, or before it has been returned to the host + gemm_done.wait(); + + float status = 0.0; + + //# verify C matrix using USM data + std::cout << "\n"; + std::cout << "C = \n"; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + if (A_usm[i*m+j] != C_usm[i*m+j]) status = 1; + std::cout << C_usm[i*m+j] << " "; + } + std::cout << "\n"; + } + std::cout << "\n"; + + //# free usm pointers + sycl::free(A_usm, queue); + sycl::free(B_usm, queue); + sycl::free(C_usm, queue); + + status == 0 ? 
std::cout << "Verified: A = C\n" : std::cout << "Failed: A != C\n"; + return status; +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/omp_gemm.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/omp_gemm.cpp new file mode 100644 index 0000000000..a0d1f0e4b0 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/lab/omp_gemm.cpp @@ -0,0 +1,95 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include "mkl.h" //# main mkl header +#include "mkl_omp_offload.h" //# mkl OMP Offload interface + +int dnum = 0; + +int main() { + + //# dimensions + MKL_INT m = 3, n = 3, k = 3; + //# leading dimensions + MKL_INT ldA = k, ldB = n, ldC = n; + //# scalar multipliers + double alpha = 1.0; + double beta = 1.0; + //# matrix data + double *A = (double *)malloc(m * k * sizeof(double)); + double *B = (double *)malloc(k * n * sizeof(double)); + double *C = (double *)malloc(m * n * sizeof(double)); + + //# define matrix A as the 3x3 matrix + //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}} + for (int i = 0; i < m; i++) { + for (int j = 0; j < k; j++) { + A[i*m+j] = (double)(i*m+j) + 1.0; + } + } + + //# define matrix B as the identity matrix + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + if (i == j) B[i*k+j] = 1.0; + else B[i*k+j] = 0.0; + } + } + + //# initialize C as a 0 matrix + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + C[i*m+j] = 0.0; + } + } + + MKL_INT sizeA = m*k; + MKL_INT sizeB = k*n; + MKL_INT sizeC = m*n; + + //# Below are the two compiler directives necessary to offload the GEMM operation + //# we are using 'dgemm' to specify we are using double-precision values + + //# The outer directive maps input data (matrices A & B) 'to' the device. + //# It also maps output data (matrix C) 'from' the device so that the results of the operation are returned. 
+ //# Finally, this directive specifies device number 0, which should interact with an available GPU. + + //# The inner directive dispatches the correct version of the contained operation, again specifying the device number. + //# This directive also uses the 'use_device_ptr' statement to specify the data we are working with (in this case, arrays A, B, & C). + + //# Uncomment the two 'pragma' lines below. (Do not remove the '#' character) + + #pragma omp target data map(to:A[0:sizeA],B[0:sizeB]) map(from:C[0:sizeC]) device(dnum) + { + #pragma omp target variant dispatch device(dnum) use_device_ptr(A, B, C) + { + dgemm("N", "N", &m, &n, &k, &alpha, A, &ldA, B, &ldB, &beta, C, &ldC); + } + } + + int status = 0; + + //# verify C matrix + printf("\n"); + printf("C = \n"); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + if (A[i*m+j] != C[i*m+j]) status = 1; + printf("%0.0f ", C[i*m+j]); + } + printf("\n"); + } + printf("\n"); + + //# free matrix data + free(A); + free(B); + free(C); + + status == 0 ? printf("Verified: A = C\n") : printf("Failed: A != C\n"); + + return status; +} diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/q b/Libraries/gemm_oneMKL_SYCL/00_GEMM/q new file mode 100644 index 0000000000..b06e66b787 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/q @@ -0,0 +1,52 @@ +#!/bin/bash +#========================================== +# Copyright © 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT +#========================================== +# Script to submit job in Intel(R) DevCloud +# Version: 0.7 +#========================================== +if [ -z "$1" ]; then + echo "Missing script argument, Usage: ./q run.sh" +elif [ ! -f "$1" ]; then + echo "File $1 does not exist" +else + echo "Job has been submitted to Intel(R) DevCloud and will execute soon." + echo "" + echo " If you do not see result in 60 seconds, please restart the Jupyter kernel:" + echo " Kernel -> 'Restart Kernel and Clear All Outputs...' 
and then try again" + echo "" + script=$1 + # Remove old output files + rm *.sh.* > /dev/null 2>&1 + # Submit job using qsub + qsub_id=`qsub -l nodes=1:gpu:ppn=2 -d . $script` + job_id="$(cut -d'.' -f1 <<<"$qsub_id")" + # Print qstat output + qstat + # Wait for output file to be generated and display + echo "" + echo -ne "Waiting for Output " + until [ -f $script.o$job_id ]; do + sleep 1 + echo -ne "█" + ((timeout++)) + # Timeout if no output file generated within 60 seconds + if [ $timeout == 60 ]; then + echo "" + echo "" + echo "TimeOut 60 seconds: Job is still queued for execution, check for output file later ($script.o$job_id)" + echo "" + break + fi + done + # Print output and error file content if exist + if [ -n "$(find -name '*.sh.o'$job_id)" ]; then + echo " Done⬇" + cat $script.o$job_id + cat $script.e$job_id + echo "Job Completed in $timeout seconds." + rm *.sh.*$job_id > /dev/null 2>&1 + fi +fi diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_buffers.sh b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_buffers.sh new file mode 100644 index 0000000000..b7a4bdc88f --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_buffers.sh @@ -0,0 +1,7 @@ +#!/bin/bash +source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1 +/bin/echo "##" $(whoami) is compiling oneMKL_introduction Module0 -- gemm with buffers - 1 of 3 dpcpp_gemm_buffers.cpp + +icpx -fsycl -fsycl-device-code-split=per_kernel -DMKL_ILP64 -I$MKLROOT/include -L$MKLROOT/lib/intel64 -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lsycl -lOpenCL -lpthread -lm -ldl lab/dpcpp_gemm_buffers.cpp + +if [ $? 
-eq 0 ]; then ./a.out; fi diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_omp.sh b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_omp.sh new file mode 100644 index 0000000000..25be751795 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_omp.sh @@ -0,0 +1,5 @@ +#!/bin/bash +source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1 +/bin/echo "##" $(whoami) is compiling oneMKL_introduction Module0 -- gemm with openmp - 3 of 3 omp_gemm.cpp +icx lab/omp_gemm.cpp -fsycl-device-code-split=per_kernel -DMKL_ILP64 -m64 -I/opt/intel/oneapi/mkl/2021.1-beta10/include -fsycl -fiopenmp -fopenmp-targets=spir64 -mllvm -vpo-paropt-use-raw-dev-ptr -L/opt/intel/oneapi/mkl/2021.1-beta10/lib/intel64 -lmkl_sycl -Wl,--start-group -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -Wl,--end-group -lsycl -lOpenCL -lpthread -ldl -lm -lstdc++ +if [ $? -eq 0 ]; then ./a.out; fi diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_usm.sh b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_usm.sh new file mode 100644 index 0000000000..a6e3aa6443 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/run_gemm_usm.sh @@ -0,0 +1,5 @@ +#!/bin/bash +source /opt/intel/oneapi/setvars.sh > /dev/null 2>&1 +/bin/echo "##" $(whoami) is compiling oneMKL_introduction Module0 -- gemm with usm - 2 of 3 dpcpp_gemm_usm.cpp +dpcpp lab/dpcpp_gemm_usm.cpp -fsycl-device-code-split=per_kernel -DMKL_ILP64 -I$MKLROOT/include -L$MKLROOT/lib/intel64 -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lsycl -lOpenCL -lpthread -lm -ldl +if [ $? 
-eq 0 ]; then ./a.out; fi diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_buffers.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_buffers.cpp new file mode 100644 index 0000000000..db13e2bed6 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_buffers.cpp @@ -0,0 +1,99 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include //# sycl namespace +#include "oneapi/mkl/blas.hpp" //# oneMKL DPC++ interface for BLAS functions + +//# The following project performs matrix multiplication using oneMKL / DPC++ with buffers. +//# We will execute the simple operation A * B = C +//# The matrix B is set equal to the identity matrix such that A * B = A * I +//# After performing the computation, we will verify A * I = C -> A = C + +namespace mkl = oneapi::mkl; //# shorten mkl namespace + +int main() { + + //# dimensions + int m = 3, n = 3, k = 3; + //# leading dimensions + int ldA = 3, ldB = 3, ldC = 3; + //# scalar multipliers + double alpha = 1.0, beta = 1.0; + //# transpose status of matrices + mkl::transpose transA = mkl::transpose::nontrans; + mkl::transpose transB = mkl::transpose::nontrans; + //# matrix data + std::vector A = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; + std::vector B = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0}; + std::vector C = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + //### Step 1 - Observe the definition of an asynchronous exception handler. + //# This function object will later be supplied to the queue. + //# It is designed to handle errors thrown while device code executes. 
+ auto async_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl; + } + } + }; + + //### Step 2 - Create a device object. + //# Device selectors are used to specify the type of a device. + //# Uncomment _one_ of the following three lines to select a device. + // sycl::device device = sycl::device(sycl::default_selector()); //# default_selector returns a device based on a performance heuristic + // sycl::device device = sycl::device(sycl::cpu_selector()); //# cpu_selector returns a cpu device + // sycl::device device = sycl::device(sycl::gpu_selector()); //# gpu_selector returns a gpu device + std::cout << "Device: " << device.get_info() << "\n"; + + //### Step 3 - Create a queue object. + //# A queue accepts a single device, and optionally, an exception handler. + //# Uncomment the following line to initialize a queue with our device and handler. + // sycl::queue queue(device, async_handler); + + //### Step 4 - Create buffers to hold our matrix data. + //# Buffer objects can be constructed given a container + //# Observe the creation of buffers for matrices A and B. + //# Try and create a third buffer for matrix C called C_buffer. + //# The solution is shown in the hidden cell below. + sycl::buffer A_buffer(A); + sycl::buffer B_buffer(B); + /* define C_buffer here */ + + //### Step 5 - Execute gemm operation. + //# Here, we need only pass in our queue and other familiar matrix multiplication parameters. + //# This includes the dimensions and data buffers for matrices A, B, and C. 
+ mkl::blas::gemm(queue, transA, transB, m, n, k, alpha, A_buffer, ldA, B_buffer, ldB, beta, C_buffer, ldC); + + //# we cannot explicitly transfer memory to/from the device when using buffers + //# that is why we must use this operation to ensure result data is returned to the host + queue.wait_and_throw(); //# block until operation completes, throw any errors + + //### Step 6 - Observe creation of accessors to retrieve data from A_buffer and C_buffer. + sycl::host_accessor A_acc(A_buffer, sycl::read_only); + sycl::host_accessor C_acc(C_buffer, sycl::read_only); + + int status = 0; + + // verify C matrix using accessor to observe values held in C_buffer + std::cout << "\n"; + std::cout << "C = \n"; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + if (A_acc[i*m+j] != C_acc[i*m+j]) status = 1; + std::cout << C_acc[i*m+j] << " "; + } + std::cout << "\n"; + } + std::cout << "\n"; + + status == 0 ? std::cout << "Verified: A = C\n" : std::cout << "Failed: A != C\n"; + return status; +} \ No newline at end of file diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_usm.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_usm.cpp new file mode 100644 index 0000000000..faecdf25d0 --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/dpcpp_gemm_usm.cpp @@ -0,0 +1,123 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include //# sycl namespace +#include "oneapi/mkl/blas.hpp" //# oneMKL DPC++ interface for BLAS functions + +//# The following project performs matrix multiplication using oneMKL / DPC++ with Unified Shared Memory (USM) +//# We will execute the simple operation A * B = C +//# The matrix B is set equal to the identity matrix such that A * B = A * I +//# After performing the computation, we will verify A * I = C -> A = C + +namespace mkl = 
oneapi::mkl; //# shorten mkl namespace + +int main() { + + //# dimensions + int m = 3, n = 3, k = 3; + //# leading dimensions + int ldA = 3, ldB = 3, ldC = 3; + //# scalar multipliers + double alpha = 1.0, beta = 1.0; + //# transpose status of matrices + mkl::transpose transA = mkl::transpose::nontrans; + mkl::transpose transB = mkl::transpose::nontrans; + + //### Step 1 - Observe the definition of an asynchronous exception handler. + //# This function object will later be supplied to the queue. + //# It is designed to handle errors thrown while device code executes. + auto async_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl; + } + } + }; + + //### Step 2 - Create a device object. + //# Device selectors are used to specify the type of a device. + //# Uncomment _one_ of the following three lines to select a device. + // sycl::device device = sycl::device(sycl::default_selector()); //# default_selector returns a device based on a performance heuristic + // sycl::device device = sycl::device(sycl::cpu_selector()); //# cpu_selector returns a cpu device + // sycl::device device = sycl::device(sycl::gpu_selector()); //# gpu_selector returns a gpu device + std::cout << "Device: " << device.get_info() << "\n"; + + //### Step 3 - Create a queue object. + //# A queue accepts a single device, and optionally, an exception handler. + //# Uncomment the following line to initialize a queue with our device and handler. 
+ // sycl::queue queue(device, async_handler); + + //### Step 4 - Create a sycl event and allocate USM + //# The later execution of the gemm operation is tied to this event + //# The gemm operation will also make use of a vector of sycl events we can call 'gemm_dependencies' + sycl::event gemm_done; + std::vector gemm_dependencies; + //# Here, we allocate USM pointers for each matrix, using the special 'malloc_shared' function + //# Make sure to template the function with the correct precision, and pass in our queue to the function call + double *A_usm = sycl::malloc_shared(m * k, queue); + double *B_usm = sycl::malloc_shared(k * n, queue); + double *C_usm = sycl::malloc_shared(m * n, queue); + + //# define matrix A as the 3x3 matrix + //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}} + for (int i = 0; i < m; i++) { + for (int j = 0; j < k; j++) { + A_usm[i*m+j] = (double)(i*m+j) + 1.0; + } + } + + //# define matrix B as the identity matrix + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + if (i == j) B_usm[i*k+j] = 1.0; + else B_usm[i*k+j] = 0.0; + } + } + + //# initialize C as a 0 matrix + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + C_usm[i*m+j] = 0.0; + } + } + + //### Step 5 - Execute gemm operation. + //# Here, we fill in the familiar parameters for the gemm operation. + //# However, we must also pass in the queue as the first parameter. + //# We must also pass in our list of dependencies as the final parameter. + //# We are also passing in our USM pointers as opposed to a buffer or raw data pointer. 
+ gemm_done = mkl::blas::gemm(queue, transA, transB, m, n, k, alpha, A_usm, ldA, B_usm, ldB, beta, C_usm, ldC, gemm_dependencies); + + //# We must now wait for the given event to finish before accessing any data involved in the operation + //# Otherwise, we may access data before the operation has completed, or before it has been returned to the host + gemm_done.wait(); + + int status = 0; + + //# verify C matrix using USM data + std::cout << "\n"; + std::cout << "C = \n"; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + if (A_usm[i*m+j] != C_usm[i*m+j]) status = 1; + std::cout << C_usm[i*m+j] << " "; + } + std::cout << "\n"; + } + std::cout << "\n"; + + //# free usm pointers + sycl::free(A_usm, queue); + sycl::free(B_usm, queue); + sycl::free(C_usm, queue); + + status == 0 ? std::cout << "Verified: A = C\n" : std::cout << "Failed: A != C\n"; + return status; +} \ No newline at end of file diff --git a/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/omp_gemm.cpp b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/omp_gemm.cpp new file mode 100644 index 0000000000..ef1413928e --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/00_GEMM/src/omp_gemm.cpp @@ -0,0 +1,95 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include "mkl.h" //# main mkl header +#include "mkl_omp_offload.h" //# mkl OMP Offload interface + +int dnum = 0; + +int main() { + + //# dimensions + MKL_INT m = 3, n = 3, k = 3; + //# leading dimensions + MKL_INT ldA = k, ldB = n, ldC = n; + //# scalar multipliers + double alpha = 1.0; + double beta = 1.0; + //# matrix data + double *A = (double *)malloc(m * k * sizeof(double)); + double *B = (double *)malloc(k * n * sizeof(double)); + double *C = (double *)malloc(m * n * sizeof(double)); + + //# define matrix A as the 3x3 matrix + //# {{ 1, 2, 3}, {4, 5, 6}, {7, 8, 9}} + for (int i = 0; i < 
m; i++) { + for (int j = 0; j < k; j++) { + A[i*m+j] = (double)(i*m+j) + 1.0; + } + } + + //# define matrix B as the identity matrix + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + if (i == j) B[i*k+j] = 1.0; + else B[i*k+j] = 0.0; + } + } + + //# initialize C as a 0 matrix + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + C[i*m+j] = 0.0; + } + } + + MKL_INT sizeA = m*k; + MKL_INT sizeB = k*n; + MKL_INT sizeC = m*n; + + //# Below are the two compiler directives necessary to offload the GEMM operation + //# we are using 'dgemm' to specify we are using double-precision values + + //# The outer directive maps input data (matrices A & B) 'to' the device. + //# It also maps output data (matrix C) 'from' the device so that the results of the operation are returned. + //# Finally, this directive specifies device number 0, which should interact with an available GPU. + + //# The inner directive dispatches the correct version of the contained operation, again specifying the device number. + //# This directive also uses the 'use_device_ptr' statement to specify the data we are working with (in this case, arrays A, B, & C). + + //# Uncomment the two 'pragma' lines below. (Do not remove the '#' character) + + //#pragma omp target data map(to:A[0:sizeA],B[0:sizeB]) map(from:C[0:sizeC]) device(dnum) + { + //#pragma omp target variant dispatch device(dnum) use_device_ptr(A, B, C) + { + dgemm("N", "N", &m, &n, &k, &alpha, A, &ldA, B, &ldB, &beta, C, &ldC); + } + } + + int status = 0; + + //# verify C matrix + printf("\n"); + printf("C = \n"); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + if (A[i*m+j] != C[i*m+j]) status = 1; + printf("%0.0f ", C[i*m+j]); + } + printf("\n"); + } + printf("\n"); + + //# free matrix data + free(A); + free(B); + free(C); + + status == 0 ? 
printf("Verified: A = C\n") : printf("Failed: A != C\n"); + + return status; +} \ No newline at end of file diff --git a/Libraries/gemm_oneMKL_SYCL/oneMKL_Intro.ipynb b/Libraries/gemm_oneMKL_SYCL/oneMKL_Intro.ipynb new file mode 100644 index 0000000000..48fc9c776e --- /dev/null +++ b/Libraries/gemm_oneMKL_SYCL/oneMKL_Intro.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Intel oneAPI MKL Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Sections\n", + "* [Learning Objectives](#Learning-Objectives)\n", + "* [oneMKL Overview](#oneMKL-Overview)\n", + "* [Prerequisites](#Prerequisites)\n", + "* [oneMKL With DPC++](#oneMKL-With-DPC++)\n", + "* [oneMKL With OpenMP Offload](#oneMKL-With-OpenMP-Offload)\n", + "* [Modules](#Modules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Learning Objectives\n", + "* Understand how the __Intel oneAPI Math Kernel Library (oneMKL)__ fits within the __oneAPI programming model__ for heterogeneous computing\n", + "* Know the difference between the __Data Parallel C++ (DPC++)__ and __OpenMP Offload__ approaches to oneMKL and when to use each one\n", + "* Get __hands-on__ experience with common oneMKL routines" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## oneMKL Overview\n", + "oneMKL provides a set of optimized scientific computing routines familiar to users of the Intel Math Kernel Library (MKL). This toolkit extends heterogeneous computing functionality via the DPC++ and OpenMP Offload interfaces.\n", + "\n", + "Each interface follows a specific use case. Generally, users creating new data-parallel projects *or* migrating CUDA or OpenCL projects should opt for DPC++, while those updating legacy C or Fortran code should use OpenMP Offload." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "The following courses prime the reader on the use of oneAPI with DPC++ and with OpenMP Offload. The courses also provide motivation for the use of each method and serve as a foundation for the learnings in this lab.\n", + "\n", + "* [Essentials of Data Parallel C++](https://software.intel.com/content/www/us/en/develop/tools/oneapi/training/dpc-essentials.html)\n", + "* [OpenMP* Offload Basics](https://software.intel.com/content/www/us/en/develop/tools/oneapi/training/openmp-offload.html)\n", + "\n", + "oneMKL simplifies the use of the oneAPI programming model and handles much of the work for users. As such it is *not* necessary to work through all of the training modules in the *Essentials of Data Parallel C++* lab. Below is the list of minimum recommended training modules for DPC++ before starting this lab.\n", + "\n", + "* oneAPI_Essentials/01_oneAPI_Intro\n", + "* oneAPI_Essentials/02_DPCPP_Program_structure\n", + "* oneAPI_Essentials/03_DPCPP_Unified_Shared_Memory\n", + "\n", + "As for the OpenMP Offload approach, it will be worthwhile to view all training modules in the OpenMP Offload Basics lab." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## oneMKL With DPC++\n", + "The oneMKL DPC++ interface allows DPC++ programs to take advantage of oneMKL routines. When working with the DPC++ we must keep track of a few important components, including:\n", + "\n", + "* __Device(s)__ on which oneMKL functions will execute\n", + "* __Queue__ to schedule submission of tasks to device(s)\n", + "\n", + "oneMKL also supports different DPC++ memory management models.\n", + "\n", + "1. __Buffers__ and __accessors__\n", + "2. __Unified shared memory__\n", + "\n", + "A typical DPC++ program requires the user to create a __kernel__, contained within a __command group__. 
The user must then submit the __command group__ to the __queue__, scheduling its execution on the given __device__. \n", + "\n", + "oneMKL provides a simpler path. Instead of the traditional approach, the user need only create a __queue__ and pass it to a oneMKL function call. The function selects a pre-written kernel, optimized for the chosen device, and submits it to our queue. There is *no* need to write a __kernel__ or __command group__." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## oneMKL With OpenMP Offload\n", + "The OpenMP Offload approach interfaces well with existing C code, allowing programs to execute on GPUs with __minimal__ changes to the source. OpenMP Offload utilizes __directives__ in the form of ```#pragma``` statements. The *OpenMP Offload Basics* lab linked above explores these directives in greater details. The following modules will explain how to target the OpenMP Offload interface for oneMKL, and how to set up the necessary ```#pragma``` statements for each routine." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modules\n", + "Each module is a self contained lab explaining the usage of a specific oneMKL routine. Further, each module shows the usage of a given operation under three different paradigms:\n", + "1. DPC++ with buffer/accessor memory model\n", + "2. DPC++ with unified shared memory model\n", + "3. OpenMP Offload\n", + "\n", + "### 00 - [Matrix Multiplication (GEMM)](./00_GEMM/00_GEMM.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "As is shown in the above modules, oneMKL enables users by providing an easy way to utilize heterogeneous computing platforms. 
Whether it be DPC++ for new applications or OpenMP Offload for legacy code, oneMKL provides a means to accelerate scientific computing workloads.\n", + "\n", + "Hopefully, you can now:\n", + "\n", + "* Understand the use of oneMKL within the oneAPI framework\n", + "* Utilize DPC++ to take advantage of heterogeneous computing systems\n", + "* Execute oneMKL routines on a GPU with OpenMP Offload" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Survey

\n", + "\n", + "[We would appreciate any feedback you’d care to give, so that we can improve the overall training quality and experience. Thanks! ](https://intel.az1.qualtrics.com/jfe/form/SV_3elZDqbEP3ZcXC5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7 (Intel® oneAPI)", + "language": "python", + "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}